From aa6a6a2d16c1e2e27e986936369959d70316199f Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Tue, 17 Feb 2026 14:14:56 +0100
Subject: [PATCH 001/131] perf parse-events: Fix big-endian 'overwrite' by
 writing correct union member

The "Read backward ring buffer" test crashes on big-endian (e.g. s390x)
due to a NULL dereference when the backward mmap path isn't enabled.

Reproducer:
  # ./perf test -F 'Read backward ring buffer'
  Segmentation fault (core dumped)
  # uname -m
  s390x
  #

Root cause:
get_config_terms() stores into evsel_config_term::val.val (u64) while later
code reads boolean fields such as evsel_config_term::val.overwrite.
On big-endian the 1-byte boolean is left-aligned, so writing
evsel_config_term::val.val = 1 is read back as
evsel_config_term::val.overwrite = 0,
leaving backward mmap disabled and a NULL map being used.

Store values in the union member that matches the term type, e.g.:
  /* for OVERWRITE */
  new_term->val.overwrite = 1;  /* not new_term->val.val = 1 */
to fix this. Improve add_config_term() and add two more parameters for
string and value. Function add_config_term() now creates a complete node
element of type evsel_config_term and handles all evsel_config_term::val
union members.

Impact:
Enables backward mmap on big-endian and prevents the crash.
No change on little-endian.

Output after:
 # ./perf test -Fv 44
 --- start ---
 Using CPUID IBM,9175,705,ME1,3.8,002f
 mmap size 1052672B
 mmap size 8192B
 ---- end ----
 44: Read backward ring buffer                         : Ok
 #

Fixes: 159ca97cd97c ("perf parse-events: Refactor get_config_terms() to remove macros")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Jan Polensky <japo@linux.ibm.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Acked-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/parse-events.c | 82 +++++++++++++++++++++++++++-------
 1 file changed, 65 insertions(+), 17 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index b9efb296bba5..7b4629625b1e 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1117,7 +1117,7 @@ static int config_attr(struct perf_event_attr *attr,
 
 static struct evsel_config_term *add_config_term(enum evsel_term_type type,
 						 struct list_head *head_terms,
-						 bool weak)
+						 bool weak, char *str, u64 val)
 {
 	struct evsel_config_term *t;
 
@@ -1128,8 +1128,62 @@ static struct evsel_config_term *add_config_term(enum evsel_term_type type,
 	INIT_LIST_HEAD(&t->list);
 	t->type = type;
 	t->weak	= weak;
-	list_add_tail(&t->list, head_terms);
 
+	switch (type) {
+	case EVSEL__CONFIG_TERM_PERIOD:
+	case EVSEL__CONFIG_TERM_FREQ:
+	case EVSEL__CONFIG_TERM_STACK_USER:
+	case EVSEL__CONFIG_TERM_USR_CHG_CONFIG:
+	case EVSEL__CONFIG_TERM_USR_CHG_CONFIG1:
+	case EVSEL__CONFIG_TERM_USR_CHG_CONFIG2:
+	case EVSEL__CONFIG_TERM_USR_CHG_CONFIG3:
+	case EVSEL__CONFIG_TERM_USR_CHG_CONFIG4:
+		t->val.val = val;
+		break;
+	case EVSEL__CONFIG_TERM_TIME:
+		t->val.time = val;
+		break;
+	case EVSEL__CONFIG_TERM_INHERIT:
+		t->val.inherit = val;
+		break;
+	case EVSEL__CONFIG_TERM_OVERWRITE:
+		t->val.overwrite = val;
+		break;
+	case EVSEL__CONFIG_TERM_MAX_STACK:
+		t->val.max_stack = val;
+		break;
+	case EVSEL__CONFIG_TERM_MAX_EVENTS:
+		t->val.max_events = val;
+		break;
+	case EVSEL__CONFIG_TERM_PERCORE:
+		t->val.percore = val;
+		break;
+	case EVSEL__CONFIG_TERM_AUX_OUTPUT:
+		t->val.aux_output = val;
+		break;
+	case EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE:
+		t->val.aux_sample_size = val;
+		break;
+	case EVSEL__CONFIG_TERM_CALLGRAPH:
+	case EVSEL__CONFIG_TERM_BRANCH:
+	case EVSEL__CONFIG_TERM_DRV_CFG:
+	case EVSEL__CONFIG_TERM_RATIO_TO_PREV:
+	case EVSEL__CONFIG_TERM_AUX_ACTION:
+		if (str) {
+			t->val.str = strdup(str);
+			if (!t->val.str) {
+				zfree(&t);
+				return NULL;
+			}
+			t->free_str = true;
+		}
+		break;
+	default:
+		t->val.val = val;
+		break;
+	}
+
+	list_add_tail(&t->list, head_terms);
 	return t;
 }
 
@@ -1142,7 +1196,7 @@ static int get_config_terms(const struct parse_events_terms *head_config,
 		struct evsel_config_term *new_term;
 		enum evsel_term_type new_type;
 		bool str_type = false;
-		u64 val;
+		u64 val = 0;
 
 		switch (term->type_term) {
 		case PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD:
@@ -1234,20 +1288,15 @@ static int get_config_terms(const struct parse_events_terms *head_config,
 			continue;
 		}
 
-		new_term = add_config_term(new_type, head_terms, term->weak);
+		/*
+		 * Note: Members evsel_config_term::val and
+		 * parse_events_term::val are unions and endianness needs
+		 * to be taken into account when changing such union members.
+		 */
+		new_term = add_config_term(new_type, head_terms, term->weak,
+					   str_type ? term->val.str : NULL, val);
 		if (!new_term)
 			return -ENOMEM;
-
-		if (str_type) {
-			new_term->val.str = strdup(term->val.str);
-			if (!new_term->val.str) {
-				zfree(&new_term);
-				return -ENOMEM;
-			}
-			new_term->free_str = true;
-		} else {
-			new_term->val.val = val;
-		}
 	}
 	return 0;
 }
@@ -1277,10 +1326,9 @@ static int add_cfg_chg(const struct perf_pmu *pmu,
 	if (bits) {
 		struct evsel_config_term *new_term;
 
-		new_term = add_config_term(new_term_type, head_terms, false);
+		new_term = add_config_term(new_term_type, head_terms, false, NULL, bits);
 		if (!new_term)
 			return -ENOMEM;
-		new_term->val.cfg_chg = bits;
 	}
 
 	return 0;

From c5a244bf17caf2de22f9e100832b75f72b31d3e6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 6 Feb 2026 16:49:56 -0800
Subject: [PATCH 002/131] perf metricgroup: Fix
 metricgroup__has_metric_or_groups

Use metricgroup__for_each_metric, which combines the default metric
table with a potentially empty CPUID table, rather than
pmu_metrics_table__for_each_metric.

Fixes: cee275edcdb1 ("perf metricgroup: Don't early exit if no CPUID table exists")
Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Leo Yan <leo.yan@arm.com>
Tested-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/metricgroup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 46bf4dfeebc8..7e39d469111b 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -1605,9 +1605,9 @@ bool metricgroup__has_metric_or_groups(const char *pmu, const char *metric_or_gr
 		.metric_or_groups = metric_or_groups,
 	};
 
-	return pmu_metrics_table__for_each_metric(table,
-						  metricgroup__has_metric_or_groups_callback,
-						  &data)
+	return metricgroup__for_each_metric(table,
+					    metricgroup__has_metric_or_groups_callback,
+					    &data)
 		? true : false;
 }
 

From 0feca0b788567debbaec6a9a329f5bee1b15c705 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Tue, 17 Feb 2026 17:40:56 -0800
Subject: [PATCH 003/131] perf script: Fix brcntr output with --xed

brcntr in perf script brstack insn currently outputs

  $ perf record -j any,counter ...
  $ perf script -F +brcntr,+brstackinsn
  ...
              BC1s 3450809 5665912.127194:     100127
  cpu_core/cycles/:      7f0475d6cc89 handle_intel.constprop.0+0x2b
  (/usr/lib64/ld-linux-
  x86-64.so.2)
          intel_check_word.constprop.0+224:
          00007f0475d6ca7e        insn: 00 4b db                  br_cntr: # PRED 21 cycles [21]
  ...

This has two issues:
- The description says no event is a single dash, but that is not what is printed.
- The b in brcntr is ambiguous with the hex numbers in insns, which
  breaks with --xed. It parses the b as another instruction byte and
  merges the instruction with a missing b and no space:

  $ perf script -F +brstackinsn,+brcntr --xed
  ...
          00005618c6d683b5                        jnz 0x5618c6d683bdr_cntr:       # PRED 5 cycles [1396] 8.60 IPC

This patch fixes these two problems. It moves the brcntr output into
the "#" comment which also looks nicer and also fixes the no event case.

  $ perf script -F +brstackinsn,+brcntr --xed
  ...
          00005618c6d6624f                        jnz 0x5618c6d65fb7 # br_cntr: -  MISPRED 1 cycles [1398] 3.00 IPC

Since the old broken format has shipped for a few releases there is a
risk of breaking some existing parser, but since this is an obscure
feature I hope they're not too common and can adapt.

Signed-off-by: Andi Kleen <andi@firstfloor.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-script.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 7c743a303507..9f8b0fd27a0a 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -1271,11 +1271,11 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en,
 
 	if (PRINT_FIELD(BRCNTR)) {
 		struct evsel *pos = evsel__leader(evsel);
-		unsigned int i = 0, j, num, mask, width;
+		unsigned int i = 0, j, num, mask, width, numprinted = 0;
 
 		perf_env__find_br_cntr_info(evsel__env(evsel), NULL, &width);
 		mask = (1L << width) - 1;
-		printed += fprintf(fp, "br_cntr: ");
+		printed += fprintf(fp, "\t# br_cntr: ");
 		evlist__for_each_entry_from(evsel->evlist, pos) {
 			if (!(pos->core.attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS))
 				continue;
@@ -1283,16 +1283,20 @@ static int ip__fprintf_jump(uint64_t ip, struct branch_entry *en,
 				break;
 
 			num = (br_cntr >> (i++ * width)) & mask;
+			numprinted += num;
 			if (!verbose) {
 				for (j = 0; j < num; j++)
 					printed += fprintf(fp, "%s", pos->abbr_name);
 			} else
 				printed += fprintf(fp, "%s %d ", pos->name, num);
 		}
-		printed += fprintf(fp, "\t");
+		if (numprinted == 0 && !verbose)
+			printed += fprintf(fp, "-");
+		printed += fprintf(fp, " ");
 	}
 
-	printed += fprintf(fp, "#%s%s%s%s",
+	printed += fprintf(fp, "%s%s%s%s%s",
+			      !PRINT_FIELD(BRCNTR) ? "#" : "",
 			      en->flags.predicted ? " PRED" : "",
 			      en->flags.mispred ? " MISPRED" : "",
 			      en->flags.in_tx ? " INTX" : "",

From 96f202eab8133f94479b14a32902c636e9bdf6af Mon Sep 17 00:00:00 2001
From: wangguangju <wangguangju@hygon.cn>
Date: Thu, 26 Feb 2026 20:22:08 +0800
Subject: [PATCH 004/131] perf trace: Fix IS_ERR() vs NULL check bug

The alloc_syscall_stats() function always returns an error pointer
(ERR_PTR) on failure.

So replace the NULL check with an IS_ERR() check in the
delete_syscall_stats() function.

Fixes: ef2da619b132c6f74 ("perf trace: Convert syscall_stats to hashmap")
Signed-off-by: wangguangju <wangguangju@hygon.cn>
Reviewed-by: Howard Chu <howardchu95@gmail.com>
Acked-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 311d9da9896a..295b272c6c29 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1573,7 +1573,7 @@ static void delete_syscall_stats(struct hashmap *syscall_stats)
 	struct hashmap_entry *pos;
 	size_t bkt;
 
-	if (syscall_stats == NULL)
+	if (IS_ERR(syscall_stats))
 		return;
 
 	hashmap__for_each_entry(syscall_stats, pos, bkt)

From af894feb32570cafea582b100d674b042479544f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 25 Feb 2026 17:49:55 -0800
Subject: [PATCH 005/131] perf trace: Handle task exit in BPF syscall summary

Some system calls never return because they terminate the calling
thread.  Let's hook the task exit path and update the duration of the
last syscall.

Before:
  $ sudo perf trace -as --bpf-summary -- true |& grep exit
  (nothing)

After:
  $ sudo perf trace -as --bpf-summary -- true |& grep exit
     exit_group             1      0     0.004     0.004     0.004     0.004      0.00%

Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/bpf_skel/syscall_summary.bpf.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
index 1bcd066a5199..4172f3c9fc48 100644
--- a/tools/perf/util/bpf_skel/syscall_summary.bpf.c
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -118,13 +118,11 @@ int sys_enter(u64 *ctx)
 	return 0;
 }
 
-SEC("tp_btf/sys_exit")
-int sys_exit(u64 *ctx)
+static int do_exit(long ret)
 {
 	int tid;
 	int key = 0;
 	u64 cgroup = 0;
-	long ret = ctx[1]; /* return value of the syscall */
 	struct syscall_trace *st;
 	s64 delta;
 
@@ -150,4 +148,18 @@ int sys_exit(u64 *ctx)
 	return 0;
 }
 
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx)
+{
+	long ret = ctx[1]; /* return value of the syscall */
+
+	return do_exit(ret);
+}
+
+SEC("tp_btf/sched_process_exit")
+int process_exit(u64 *ctx)
+{
+	return do_exit(0);
+}
+
 char _license[] SEC("license") = "GPL";

From c1f70c83be55e6721267f850dbfaf2ae07a04858 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 18 Feb 2026 16:44:17 -0800
Subject: [PATCH 006/131] perf bench: Add -t/--threads option to perf bench mem
 mmap

Add the option so that the benchmark can measure the overhead of
mmap_lock and/or per-VMA lock contention.

  $ perf bench mem mmap -f demand -l 1000 -t 1
  # Running 'mem/mmap' benchmark:
  # function 'demand' (Demand loaded mmap())
  # Copying 1MB bytes ...

         2.786858 GB/sec

  $ perf bench mem mmap -f demand -l 1000 -t 2
  # Running 'mem/mmap' benchmark:
  # function 'demand' (Demand loaded mmap())
  # Copying 1MB bytes ...

         1.624468 GB/sec/thread   ( +-   0.30% )

  $ perf bench mem mmap -f demand -l 1000 -t 3
  # Running 'mem/mmap' benchmark:
  # function 'demand' (Demand loaded mmap())
  # Copying 1MB bytes ...

         1.493068 GB/sec/thread   ( +-   0.15% )

  $ perf bench mem mmap -f demand -l 1000 -t 4
  # Running 'mem/mmap' benchmark:
  # function 'demand' (Demand loaded mmap())
  # Copying 1MB bytes ...

         1.006087 GB/sec/thread   ( +-   0.41% )

Reviewed-by: Ankur Arora <ankur.a.arora@oracle.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-bench.txt |   4 +
 tools/perf/bench/mem-functions.c        | 109 +++++++++++++++++++-----
 2 files changed, 92 insertions(+), 21 deletions(-)

diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index 1160224cb718..c5913cf59c98 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -274,6 +274,10 @@ Repeat mmap() invocation this number of times.
 --cycles::
 Use perf's cpu-cycles event instead of gettimeofday syscall.
 
+-t::
+--threads=<NUM>::
+Create multiple threads to call mmap/munmap concurrently.
+
 SUITES FOR 'numa'
 ~~~~~~~~~~~~~~~~~
 *mem*::
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
index 2908a3a796c9..f5ab41bb85bf 100644
--- a/tools/perf/bench/mem-functions.c
+++ b/tools/perf/bench/mem-functions.c
@@ -7,13 +7,14 @@
  * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
  */
 
-#include "debug.h"
+#include "bench.h"
 #include "../perf-sys.h"
 #include <subcmd/parse-options.h>
-#include "../util/header.h"
-#include "../util/cloexec.h"
-#include "../util/string2.h"
-#include "bench.h"
+#include "util/cloexec.h"
+#include "util/debug.h"
+#include "util/header.h"
+#include "util/stat.h"
+#include "util/string2.h"
 #include "mem-memcpy-arch.h"
 #include "mem-memset-arch.h"
 
@@ -26,6 +27,7 @@
 #include <errno.h>
 #include <linux/time64.h>
 #include <linux/log2.h>
+#include <pthread.h>
 
 #define K 1024
 
@@ -41,6 +43,7 @@ static unsigned int	nr_loops	= 1;
 static bool		use_cycles;
 static int		cycles_fd;
 static unsigned int	seed;
+static unsigned int	nr_threads	= 1;
 
 static const struct option bench_common_options[] = {
 	OPT_STRING('s', "size", &size_str, "1MB",
@@ -121,6 +124,8 @@ static struct perf_event_attr cycle_attr = {
 	.config		= PERF_COUNT_HW_CPU_CYCLES
 };
 
+static struct stats stats;
+
 static int init_cycles(void)
 {
 	cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
@@ -174,18 +179,18 @@ static void clock_accum(union bench_clock *a, union bench_clock *b)
 
 static double timeval2double(struct timeval *ts)
 {
-	return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC;
+	return ((double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC) / nr_threads;
 }
 
 #define print_bps(x) do {						\
 		if (x < K)						\
-			printf(" %14lf bytes/sec\n", x);		\
+			printf(" %14lf bytes/sec", x);			\
 		else if (x < K * K)					\
-			printf(" %14lfd KB/sec\n", x / K);		\
+			printf(" %14lfd KB/sec", x / K);		\
 		else if (x < K * K * K)					\
-			printf(" %14lf MB/sec\n", x / K / K);		\
+			printf(" %14lf MB/sec", x / K / K);		\
 		else							\
-			printf(" %14lf GB/sec\n", x / K / K / K);	\
+			printf(" %14lf GB/sec", x / K / K / K);	\
 	} while (0)
 
 static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p,
@@ -196,6 +201,7 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param
 	union bench_clock rt = { 0 };
 	void *src = NULL, *dst = NULL;
 
+	init_stats(&stats);
 	printf("# function '%s' (%s)\n", r->name, r->desc);
 
 	if (r->fn.init && r->fn.init(info, p, &src, &dst))
@@ -210,11 +216,16 @@ static void __bench_mem_function(struct bench_mem_info *info, struct bench_param
 	switch (bench_format) {
 	case BENCH_FORMAT_DEFAULT:
 		if (use_cycles) {
-			printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total);
+			printf(" %14lf cycles/byte", (double)rt.cycles/(double)p->size_total);
 		} else {
 			result_bps = (double)p->size_total/timeval2double(&rt.tv);
 			print_bps(result_bps);
 		}
+		if (nr_threads > 1) {
+			printf("/thread\t( +- %6.2f%% )",
+			       rel_stddev_stats(stddev_stats(&stats), avg_stats(&stats)));
+		}
+		printf("\n");
 		break;
 
 	case BENCH_FORMAT_SIMPLE:
@@ -494,16 +505,27 @@ static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, boo
 	}
 }
 
-static int do_mmap(const struct function *r, struct bench_params *p,
-		  void *src __maybe_unused, void *dst __maybe_unused,
-		  union bench_clock *accum)
+struct mmap_data {
+	pthread_t id;
+	const struct function *func;
+	struct bench_params *params;
+	union bench_clock result;
+	unsigned int seed;
+	int error;
+};
+
+static void *do_mmap_thread(void *arg)
 {
+	struct mmap_data *data = arg;
+	const struct function *r = data->func;
+	struct bench_params *p = data->params;
 	union bench_clock start, end, diff;
 	mmap_op_t fn = r->fn.mmap_op;
 	bool populate = strcmp(r->name, "populate") == 0;
+	void *dst;
 
-	if (p->seed)
-		srand(p->seed);
+	if (data->seed)
+		srand(data->seed);
 
 	for (unsigned int i = 0; i < p->nr_loops; i++) {
 		clock_get(&start);
@@ -514,16 +536,59 @@ static int do_mmap(const struct function *r, struct bench_params *p,
 		fn(dst, p->size, p->page_shift, p->seed);
 		clock_get(&end);
 		diff = clock_diff(&start, &end);
-		clock_accum(accum, &diff);
+		clock_accum(&data->result, &diff);
 
 		bench_munmap(dst, p->size);
 	}
 
-	return 0;
+	return data;
 out:
-	printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
-			p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
-	return -1;
+	data->error = -ENOMEM;
+	return NULL;
+}
+
+static int do_mmap(const struct function *r, struct bench_params *p,
+		  void *src __maybe_unused, void *dst __maybe_unused,
+		  union bench_clock *accum)
+{
+	struct mmap_data *data;
+	int error = 0;
+
+	data = calloc(nr_threads, sizeof(*data));
+	if (!data) {
+		printf("# Failed to allocate thread resources\n");
+		return -1;
+	}
+
+	for (unsigned int i = 0; i < nr_threads; i++) {
+		data[i].func = r;
+		data[i].params = p;
+		if (p->seed)
+			data[i].seed = p->seed + i;
+
+		if (pthread_create(&data[i].id, NULL, do_mmap_thread, &data[i]) < 0)
+			data[i].error = -errno;
+	}
+
+	for (unsigned int i = 0; i < nr_threads; i++) {
+		union bench_clock *t = &data[i].result;
+
+		pthread_join(data[i].id, NULL);
+
+		clock_accum(accum, t);
+		if (use_cycles)
+			update_stats(&stats, t->cycles);
+		else
+			update_stats(&stats, t->tv.tv_sec * 1e6 + t->tv.tv_usec);
+		error |= data[i].error;
+	}
+	free(data);
+
+	if (error) {
+		printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
+		       p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
+	}
+	return error ? -1 : 0;
 }
 
 static const char * const bench_mem_mmap_usage[] = {
@@ -548,6 +613,8 @@ int bench_mem_mmap(int argc, const char **argv)
 	static const struct option bench_mmap_options[] = {
 		OPT_UINTEGER('r', "randomize", &seed,
 			    "Seed to randomize page access offset."),
+		OPT_UINTEGER('t', "threads", &nr_threads,
+			    "Number of threads to run concurrently (default: 1)."),
 		OPT_PARENT(bench_common_options),
 		OPT_END()
 	};

From 5d580ffbb43807153a71113fd725fbf8a416d2d9 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:27 -0800
Subject: [PATCH 007/131] perf vendor events intel: Update alderlake events
 from 1.35 to 1.37

The updated events were published in:
https://github.com/intel/perfmon/commit/632936400cfc5978c7b4519c865c137de523bfdd
https://github.com/intel/perfmon/commit/a96d6bf4b50d6ce31e2ffd0be8d13022d07ae319

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../pmu-events/arch/x86/alderlake/cache.json  | 27 +++-----
 .../arch/x86/alderlake/frontend.json          | 18 +++++
 .../arch/x86/alderlake/pipeline.json          | 66 +++++++++++++++++--
 .../pmu-events/arch/x86/alderlaken/cache.json | 27 +++-----
 .../arch/x86/alderlaken/pipeline.json         | 60 +++++++++++++++--
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  4 +-
 6 files changed, 152 insertions(+), 50 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
index be15a7f83717..5d0d824f3e7e 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
@@ -876,105 +876,97 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x80",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x10",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x100",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x20",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x4",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x200",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x40",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x8",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
@@ -1030,12 +1022,11 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
+        "BriefDescription": "Counts the number of stores uops retired.",
         "Counter": "0,1,2,3,4,5",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
-        "PublicDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled. If PEBS is enabled and a PEBS record is generated, will populate PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x6",
         "Unit": "cpu_atom"
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
index ff3b30c2619a..11fc853f2d0b 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
@@ -327,6 +327,24 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_INUSE",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_ISB",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
index 57a8c78cdc49..80cad3c49d20 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
@@ -244,6 +244,15 @@
         "UMask": "0xfb",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP branch instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_JMP",
+        "SampleAfterValue": "200003",
+        "UMask": "0xef",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
         "Counter": "0,1,2,3,4,5",
@@ -464,6 +473,15 @@
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect JMP branch instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_JMP",
+        "SampleAfterValue": "200003",
+        "UMask": "0xef",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT_CALL",
         "Counter": "0,1,2,3,4,5",
@@ -573,7 +591,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1.",
@@ -582,7 +600,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD_P]",
         "Counter": "0,1,2,3,4,5",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.CORE_P",
@@ -651,7 +669,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles at TSC frequency.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses fixed counter 2.",
@@ -689,7 +707,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.",
@@ -707,7 +725,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE_P]",
         "Counter": "0,1,2,3,4,5",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.THREAD_P",
@@ -875,7 +893,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Counts the total number of instructions retired. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the total number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
         "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. Available PDIST counters: 32",
@@ -1273,6 +1291,42 @@
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x11",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles stalled due to no store buffers available. (not including draining form sync).",
         "Counter": "0,1,2,3,4,5,6,7",
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/cache.json b/tools/perf/pmu-events/arch/x86/alderlaken/cache.json
index 76a841675337..1f97a4dc6fb1 100644
--- a/tools/perf/pmu-events/arch/x86/alderlaken/cache.json
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/cache.json
@@ -246,98 +246,90 @@
         "UMask": "0x82"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x80",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x10",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x100",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x20",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x4",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x200",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x40",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x8",
-        "PublicDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled. If a PEBS record is generated, will populate the PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x5"
     },
@@ -387,12 +379,11 @@
         "UMask": "0x12"
     },
     {
-        "BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
+        "BriefDescription": "Counts the number of stores uops retired.",
         "Counter": "0,1,2,3,4,5",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
-        "PublicDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled. If PEBS is enabled and a PEBS record is generated, will populate PEBS Latency and PEBS Data Source fields accordingly.",
         "SampleAfterValue": "1000003",
         "UMask": "0x6"
     },
diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
index d650cbd48c1f..a13851071624 100644
--- a/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlaken/pipeline.json
@@ -108,6 +108,14 @@
         "SampleAfterValue": "200003",
         "UMask": "0xfb"
     },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP branch instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.INDIRECT_JMP",
+        "SampleAfterValue": "200003",
+        "UMask": "0xef"
+    },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL",
         "Counter": "0,1,2,3,4,5",
@@ -225,6 +233,14 @@
         "SampleAfterValue": "200003",
         "UMask": "0xfb"
     },
+    {
+        "BriefDescription": "Counts the number of mispredicted near indirect JMP branch instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xc5",
+        "EventName": "BR_MISP_RETIRED.INDIRECT_JMP",
+        "SampleAfterValue": "200003",
+        "UMask": "0xef"
+    },
     {
         "BriefDescription": "This event is deprecated. Refer to new event BR_MISP_RETIRED.INDIRECT_CALL",
         "Counter": "0,1,2,3,4,5",
@@ -278,7 +294,7 @@
         "UMask": "0xfe"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "PublicDescription": "Counts the number of core cycles while the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time. This event uses fixed counter 1.",
@@ -286,7 +302,7 @@
         "UMask": "0x2"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD_P]",
         "Counter": "0,1,2,3,4,5",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.CORE_P",
@@ -303,7 +319,7 @@
         "UMask": "0x1"
     },
     {
-        "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles at TSC frequency.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses fixed counter 2.",
@@ -320,7 +336,7 @@
         "UMask": "0x1"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "PublicDescription": "Counts the number of core cycles while the core is not in a halt state.  The core enters the halt state when it is running the HLT instruction. The core frequency may change from time to time. For this reason this event may have a changing ratio with regards to time.  This event uses fixed counter 1.",
@@ -328,7 +344,7 @@
         "UMask": "0x2"
     },
     {
-        "BriefDescription": "Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE_P]",
         "Counter": "0,1,2,3,4,5",
         "EventCode": "0x3c",
         "EventName": "CPU_CLK_UNHALTED.THREAD_P",
@@ -336,7 +352,7 @@
         "SampleAfterValue": "2000003"
     },
     {
-        "BriefDescription": "Counts the total number of instructions retired. (Fixed event)",
+        "BriefDescription": "Fixed Counter: Counts the total number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
         "PublicDescription": "Counts the total number of instructions that retired. For instructions that consist of multiple uops, this event counts the retirement of the last uop of the instruction. This event continues counting during hardware interrupts, traps, and inside interrupt handlers. This event uses fixed counter 0. Available PDIST counters: 32",
@@ -426,6 +442,38 @@
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x11"
+    },
     {
         "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state. For Tremont, UMWAIT and TPAUSE will only put the CPU into C0.1 activity state (not C0.2 activity state)",
         "Counter": "0,1,2,3,4,5",
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 149bbe7abaf5..9370722dc564 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -1,6 +1,6 @@
 Family-model,Version,Filename,EventType
-GenuineIntel-6-(97|9A|B7|BA|BF),v1.35,alderlake,core
-GenuineIntel-6-BE,v1.35,alderlaken,core
+GenuineIntel-6-(97|9A|B7|BA|BF),v1.37,alderlake,core
+GenuineIntel-6-BE,v1.37,alderlaken,core
 GenuineIntel-6-C[56],v1.14,arrowlake,core
 GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core
 GenuineIntel-6-(3D|47),v30,broadwell,core

From 171923140876fa243e7de63a5cc2f3f0eaa48642 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:28 -0800
Subject: [PATCH 008/131] perf vendor events intel: Update arrowlake events
 from 1.14 to 1.16

The updated events were published in:
https://github.com/intel/perfmon/commit/f0267f720eeab3b5416886c9e0e132fafcb38bbd
https://github.com/intel/perfmon/commit/d40cfa317e567fb5e8f6cbd92c81feeb7e6bd3dd

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../pmu-events/arch/x86/arrowlake/cache.json  | 103 ++++++++++++++----
 .../arch/x86/arrowlake/frontend.json          |  18 +++
 .../arch/x86/arrowlake/pipeline.json          |  40 +++++--
 tools/perf/pmu-events/arch/x86/mapfile.csv    |   2 +-
 4 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/cache.json b/tools/perf/pmu-events/arch/x86/arrowlake/cache.json
index fba4a0672f6c..4c3aa1fab5a8 100644
--- a/tools/perf/pmu-events/arch/x86/arrowlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/arrowlake/cache.json
@@ -628,6 +628,15 @@
         "UMask": "0x7f",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an instruction cache or TLB miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.ALL",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7f",
+        "Unit": "cpu_lowpower"
+    },
     {
         "BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2 cache.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -731,6 +740,24 @@
         "UMask": "0x6",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles that the core is stalled due to a demand load miss which hit in the LLC, no snoop was required, and the LLC provided data",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_NOSNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC where a snoop was required and the snoop either missed or hit without forwarding, so the LLC provided the data",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_SNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -749,6 +776,24 @@
         "UMask": "0x78",
         "Unit": "cpu_lowpower"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the caches.  DRAM, MMIO or other LOCAL memory type provides the data",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS_LOCALMEM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss and the data was provided from an unknown source. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS_LOCALMEM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_lowpower"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled to a store buffer full condition",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1081,6 +1126,15 @@
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of retired load ops with an unknown source",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd4",
+        "EventName": "MEM_LOAD_UOPS_MISC_RETIRED.LOCAL_DRAM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1181,6 +1235,15 @@
         "UMask": "0x1c",
         "Unit": "cpu_lowpower"
     },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache in which no snoop was required",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT_NO_SNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of loads that hit in a write combining buffer (WCB), excluding the first load that caused the WCB to allocate.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1331,7 +1394,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1343,7 +1406,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1355,7 +1418,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1367,7 +1430,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1379,7 +1442,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1391,7 +1454,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1403,7 +1466,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1415,7 +1478,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1427,7 +1490,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1439,7 +1502,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1451,7 +1514,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1463,7 +1526,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1475,7 +1538,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1487,7 +1550,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1499,7 +1562,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1511,7 +1574,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1523,7 +1586,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1535,7 +1598,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1707,7 +1770,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of stores uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1717,7 +1780,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of stores uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json b/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json
index a15de050a76c..21f00eafa98a 100644
--- a/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json
@@ -627,6 +627,24 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache In use-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache ISB-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json b/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json
index 805616052925..fb973c75be57 100644
--- a/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json
@@ -822,7 +822,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -839,7 +839,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -909,7 +909,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "SampleAfterValue": "2000003",
@@ -947,7 +947,7 @@
         "Unit": "cpu_lowpower"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -964,7 +964,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -1134,10 +1134,10 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of instructions retired",
+        "BriefDescription": "Fixed Counter: Counts the number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
-        "PublicDescription": "Fixed Counter: Counts the number of instructions retired Available PDIST counters: 32",
+        "PublicDescription": "Fixed Counter: Counts the number of instructions retired. Available PDIST counters: 32",
         "SampleAfterValue": "2000003",
         "UMask": "0x1",
         "Unit": "cpu_lowpower"
@@ -1607,6 +1607,14 @@
         "SampleAfterValue": "20003",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the total number of machine clears for any reason including, but not limited to, memory ordering, memory disambiguation, SMC, and FP assist.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc3",
+        "EventName": "MACHINE_CLEARS.ANY",
+        "SampleAfterValue": "20003",
+        "Unit": "cpu_lowpower"
+    },
     {
         "BriefDescription": "Counts the number of machine clears that flush the pipeline and restart the machine without the use of microcode.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1813,6 +1821,15 @@
         "UMask": "0xff",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff",
+        "Unit": "cpu_lowpower"
+    },
     {
         "BriefDescription": "Counts the number of LFENCE instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1822,6 +1839,15 @@
         "UMask": "0x2",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_lowpower"
+    },
     {
         "BriefDescription": "Counts the number of RDPMC, RDTSC, and RDTSCP instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7",
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 9370722dc564..7e9bc4241c61 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -1,7 +1,7 @@
 Family-model,Version,Filename,EventType
 GenuineIntel-6-(97|9A|B7|BA|BF),v1.37,alderlake,core
 GenuineIntel-6-BE,v1.37,alderlaken,core
-GenuineIntel-6-C[56],v1.14,arrowlake,core
+GenuineIntel-6-C[56],v1.16,arrowlake,core
 GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core
 GenuineIntel-6-(3D|47),v30,broadwell,core
 GenuineIntel-6-56,v12,broadwellde,core

From 5c0df1e860100a822d3192edcbf03c1e3b1449a2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:29 -0800
Subject: [PATCH 009/131] perf vendor events intel: Update emeraldrapid events
 from 1.20 to 1.21

The updated events were published in:
https://github.com/intel/perfmon/commit/210676cfa8743cd5b9e7cc984fdef1a48542eda4

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../arch/x86/emeraldrapids/cache.json           |  4 ++--
 .../arch/x86/emeraldrapids/frontend.json        | 16 ++++++++++++++++
 .../arch/x86/emeraldrapids/uncore-cache.json    |  4 ++--
 .../arch/x86/emeraldrapids/uncore-io.json       | 17 +++++++++--------
 tools/perf/pmu-events/arch/x86/mapfile.csv      |  2 +-
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json
index 26568e4b77f7..b2f8947f6741 100644
--- a/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json
+++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json
@@ -514,7 +514,7 @@
         "EventCode": "0xd3",
         "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM",
         "PublicDescription": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM Available PDIST counters: 0",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "100007",
         "UMask": "0x2"
     },
     {
@@ -534,7 +534,7 @@
         "EventCode": "0xd3",
         "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM",
         "PublicDescription": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM Available PDIST counters: 0",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "100007",
         "UMask": "0x4"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json
index 793c486ffabe..e51f5e85ffd1 100644
--- a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json
@@ -271,6 +271,22 @@
         "SampleAfterValue": "200003",
         "UMask": "0x4"
     },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_INUSE",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_ISB",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
index 92cf47967f0b..3c8dcd9cff7c 100644
--- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json
@@ -3501,7 +3501,7 @@
         "EventName": "UNC_CHA_SNOOP_RESP.RSPIFWD",
         "Experimental": "1",
         "PerPkg": "1",
-        "PublicDescription": "Counts when a a transaction with the opcode type RspIFwd Snoop Response was received which indicates a remote caching agent forwarded the data and the requesting agent is able to acquire the data in E (Exclusive) or M (modified) states.  This is commonly returned with RFO (the Read for Ownership issued before a write) transactions.  The snoop could have either been to a cacheline in the M,E,F (Modified, Exclusive or Forward)  states.",
+        "PublicDescription": "Counts when a transaction with the opcode type RspIFwd Snoop Response was received which indicates a remote caching agent forwarded the data and the requesting agent is able to acquire the data in E (Exclusive) or M (modified) states.  This is commonly returned with RFO (the Read for Ownership issued before a write) transactions.  The snoop could have either been to a cacheline in the M,E,F (Modified, Exclusive or Forward)  states.",
         "UMask": "0x4",
         "Unit": "CHA"
     },
@@ -3523,7 +3523,7 @@
         "EventName": "UNC_CHA_SNOOP_RESP.RSPSFWD",
         "Experimental": "1",
         "PerPkg": "1",
-        "PublicDescription": "Counts when a a transaction with the opcode type RspSFwd Snoop Response was received which indicates a remote caching agent forwarded the data but held on to its current copy.  This is common for data and code reads that hit in a remote socket in E (Exclusive) or F (Forward) state.",
+        "PublicDescription": "Counts when a transaction with the opcode type RspSFwd Snoop Response was received which indicates a remote caching agent forwarded the data but held on to its current copy.  This is common for data and code reads that hit in a remote socket in E (Exclusive) or F (Forward) state.",
         "UMask": "0x8",
         "Unit": "CHA"
     },
diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json
index d4cf2199d46b..ddb0f65307f4 100644
--- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json
+++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-io.json
@@ -223,6 +223,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
+        "PortMask": "0xff",
         "UMask": "0xff",
         "Unit": "IIO"
     },
@@ -234,7 +235,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x01",
         "PublicDescription": "x16 card plugged in to stack, Or x8 card plugged in to Lane 0/1, Or x4 card is plugged in to slot 0",
         "UMask": "0x1",
         "Unit": "IIO"
@@ -247,7 +248,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x02",
         "PublicDescription": "x4 card is plugged in to slot 1",
         "UMask": "0x2",
         "Unit": "IIO"
@@ -260,7 +261,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x04",
         "PublicDescription": "x8 card plugged in to Lane 2/3, Or x4 card is plugged in to slot 1",
         "UMask": "0x4",
         "Unit": "IIO"
@@ -273,7 +274,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x08",
         "PublicDescription": "x4 card is plugged in to slot 3",
         "UMask": "0x8",
         "Unit": "IIO"
@@ -286,7 +287,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x10",
         "PublicDescription": "x16 card plugged in to stack, Or x8 card plugged in to Lane 0/1, Or x4 card is plugged in to slot 0",
         "UMask": "0x10",
         "Unit": "IIO"
@@ -299,7 +300,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x20",
         "PublicDescription": "x4 card is plugged in to slot 1",
         "UMask": "0x20",
         "Unit": "IIO"
@@ -312,7 +313,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x40",
         "PublicDescription": "x8 card plugged in to Lane 2/3, Or x4 card is plugged in to slot 1",
         "UMask": "0x40",
         "Unit": "IIO"
@@ -325,7 +326,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x80",
         "PublicDescription": "x4 card is plugged in to slot 3",
         "UMask": "0x80",
         "Unit": "IIO"
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 7e9bc4241c61..92799bc6e9d9 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -9,7 +9,7 @@ GenuineIntel-6-4F,v23,broadwellx,core
 GenuineIntel-6-55-[56789ABCDEF],v1.25,cascadelakex,core
 GenuineIntel-6-DD,v1.00,clearwaterforest,core
 GenuineIntel-6-9[6C],v1.05,elkhartlake,core
-GenuineIntel-6-CF,v1.20,emeraldrapids,core
+GenuineIntel-6-CF,v1.21,emeraldrapids,core
 GenuineIntel-6-5[CF],v13,goldmont,core
 GenuineIntel-6-7A,v1.01,goldmontplus,core
 GenuineIntel-6-B6,v1.10,grandridge,core

From e4f8be34479c9d29ac0b35c0c8b33250b62cfaad Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:30 -0800
Subject: [PATCH 010/131] perf vendor events intel: Update grandridge events
 from 1.10 to 1.11

The updated events were published in:
https://github.com/intel/perfmon/commit/8ada944c087300c4fc79afcd8512aa3b91bd34f2

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../pmu-events/arch/x86/grandridge/cache.json | 42 +++++++++----------
 .../arch/x86/grandridge/pipeline.json         | 42 ++++++++++++++++---
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  2 +-
 3 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/grandridge/cache.json b/tools/perf/pmu-events/arch/x86/grandridge/cache.json
index 9abddb06a837..0aa921ba89b4 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/cache.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/cache.json
@@ -285,8 +285,8 @@
         "UMask": "0x82"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024",
@@ -296,8 +296,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
@@ -307,8 +307,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
@@ -318,8 +318,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048",
@@ -329,8 +329,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
@@ -340,8 +340,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
@@ -351,8 +351,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
@@ -362,8 +362,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
@@ -373,8 +373,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
@@ -384,8 +384,8 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
-        "Counter": "0,1,2,3,4,5,6,7",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
+        "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
@@ -458,7 +458,7 @@
         "UMask": "0x12"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of stores uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
diff --git a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
index f56d8d816e53..20986b987e18 100644
--- a/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/grandridge/pipeline.json
@@ -178,7 +178,7 @@
         "UMask": "0xf7"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -192,7 +192,7 @@
         "SampleAfterValue": "2000003"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "SampleAfterValue": "2000003",
@@ -208,7 +208,7 @@
         "UMask": "0x1"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is an alias of CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -222,10 +222,10 @@
         "SampleAfterValue": "2000003"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of instructions retired",
+        "BriefDescription": "Fixed Counter: Counts the number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
-        "PublicDescription": "Fixed Counter: Counts the number of instructions retired Available PDIST counters: 32",
+        "PublicDescription": "Fixed Counter: Counts the number of instructions retired. Available PDIST counters: 32",
         "SampleAfterValue": "2000003",
         "UMask": "0x1"
     },
@@ -301,6 +301,38 @@
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x11"
+    },
     {
         "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.",
         "Counter": "0,1,2,3,4,5,6,7",
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 92799bc6e9d9..b84035dc5b4f 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -12,7 +12,7 @@ GenuineIntel-6-9[6C],v1.05,elkhartlake,core
 GenuineIntel-6-CF,v1.21,emeraldrapids,core
 GenuineIntel-6-5[CF],v13,goldmont,core
 GenuineIntel-6-7A,v1.01,goldmontplus,core
-GenuineIntel-6-B6,v1.10,grandridge,core
+GenuineIntel-6-B6,v1.11,grandridge,core
 GenuineIntel-6-A[DE],v1.16,graniterapids,core
 GenuineIntel-6-(3C|45|46),v36,haswell,core
 GenuineIntel-6-3F,v29,haswellx,core

From 2c0b30e6cc0e09c669a0f166ca3d0d566246d560 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:31 -0800
Subject: [PATCH 011/131] perf vendor events intel: Update graniterapids events
 from 1.16 to 1.17

The updated events were published in:
https://github.com/intel/perfmon/commit/c9ebc3ff9c3d408a888fbfbe73d386ef86c7306f

With new IO and SNC metrics in:
https://github.com/intel/perfmon/commit/04cf5e1e804afd775401167870d48cd25864be7b
https://github.com/intel/perfmon/commit/98b2602d83de6625bae1e6fcaab3a39b0a341255

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../arch/x86/graniterapids/frontend.json      | 16 +++++++++++
 .../arch/x86/graniterapids/gnr-metrics.json   | 27 +++++++++++++++++++
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  2 +-
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json b/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json
index d580d305c926..1fdeaebb739f 100644
--- a/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json
@@ -325,6 +325,22 @@
         "SampleAfterValue": "200003",
         "UMask": "0x4"
     },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_INUSE",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_ISB",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json b/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json
index cc3c834ca286..299631fb8d53 100644
--- a/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json
@@ -143,6 +143,12 @@
         "MetricName": "io_full_write_l3_miss",
         "ScaleUnit": "100%"
     },
+    {
+        "BriefDescription": "The number of times per second that ownership of a cacheline was stolen from the integrated IO controller before it was able to write back the modified line",
+        "MetricExpr": "(UNC_I_MISC1.LOST_FWD + UNC_I_MISC1.SEC_RCVD_INVLD) / duration_time",
+        "MetricName": "io_lost_fwd",
+        "ScaleUnit": "1per_sec"
+    },
     {
         "BriefDescription": "Message Signaled Interrupts (MSI) per second sent by the integrated I/O traffic controller (IIO) to System Configuration Controller (Ubox)",
         "MetricExpr": "UNC_IIO_NUM_REQ_OF_CPU_BY_TGT.UBOX_POSTED / duration_time",
@@ -294,6 +300,27 @@
         "MetricName": "memory_bandwidth_write",
         "ScaleUnit": "1MB/s"
     },
+    {
+        "BriefDescription": "All reads to the local sub-numa cluster cache as a percentage of total memory read accesses",
+        "MetricExpr": "(L2_LINES_IN.ALL - (OCR.READS_TO_CORE.SNC_CACHE.HITM + OCR.READS_TO_CORE.SNC_CACHE.HIT_WITH_FWD + OCR.READS_TO_CORE.REMOTE_CACHE.SNOOP_FWD + OCR.READS_TO_CORE.REMOTE_MEMORY + OCR.READS_TO_CORE.L3_MISS_LOCAL)) / L2_LINES_IN.ALL",
+        "MetricName": "numa_percent_all_reads_to_local_cluster_cache",
+        "PublicDescription": "All reads to the local sub-numa cluster cache as a percentage of total memory read accesses. Includes demand and prefetch requests for data reads, code reads, and reads for ownership (RFO); does not include LLC prefetches",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "All reads to the local sub-numa cluster memory as a percentage of total memory read accesses",
+        "MetricExpr": "OCR.READS_TO_CORE.L3_MISS_LOCAL / L2_LINES_IN.ALL",
+        "MetricName": "numa_percent_all_reads_to_local_cluster_memory",
+        "PublicDescription": "All reads to the local sub-numa cluster memory as a percentage of total memory read accesses. Includes demand and prefetch requests for data reads, code reads, and reads for ownership (RFO); does not include LLC prefetches",
+        "ScaleUnit": "100%"
+    },
+    {
+        "BriefDescription": "All reads to a remote sub-numa cluster cache as a percentage of total memory read accesses",
+        "MetricExpr": "(OCR.READS_TO_CORE.SNC_CACHE.HIT_WITH_FWD + OCR.READS_TO_CORE.SNC_CACHE.HITM) / L2_LINES_IN.ALL",
+        "MetricName": "numa_percent_all_reads_to_remote_cluster_cache",
+        "PublicDescription": "All reads to a remote sub-numa cluster cache as a percentage of total memory read accesses. Includes demand and prefetch requests for data reads, code reads, and reads for ownership (RFO); does not include LLC prefetches",
+        "ScaleUnit": "100%"
+    },
     {
         "BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches",
         "MetricExpr": "(UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL) / (UNC_CHA_TOR_INSERTS.IA_MISS_DRD_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_LOCAL + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_REMOTE + UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PREF_REMOTE)",
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index b84035dc5b4f..96580ffda7bf 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -13,7 +13,7 @@ GenuineIntel-6-CF,v1.21,emeraldrapids,core
 GenuineIntel-6-5[CF],v13,goldmont,core
 GenuineIntel-6-7A,v1.01,goldmontplus,core
 GenuineIntel-6-B6,v1.11,grandridge,core
-GenuineIntel-6-A[DE],v1.16,graniterapids,core
+GenuineIntel-6-A[DE],v1.17,graniterapids,core
 GenuineIntel-6-(3C|45|46),v36,haswell,core
 GenuineIntel-6-3F,v29,haswellx,core
 GenuineIntel-6-7[DE],v1.24,icelake,core

From 6ac2011cd0c75f1de029942634c7daf1e31078f2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:32 -0800
Subject: [PATCH 012/131] perf vendor events intel: Update lunarlake events
 from 1.19 to 1.21

The updated events were published in:
https://github.com/intel/perfmon/commit/d6755a30419d02930889497741552309343bdb1e
https://github.com/intel/perfmon/commit/6c9f684ae1de6229511fd56d1196fdc2db242a41

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../pmu-events/arch/x86/lunarlake/cache.json  | 36 ++++++++++++++-----
 .../arch/x86/lunarlake/frontend.json          | 27 ++++++++++++++
 .../arch/x86/lunarlake/pipeline.json          | 10 +++---
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  2 +-
 4 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
index 3d2616be8ec1..2db3e8a51fbd 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json
@@ -550,6 +550,24 @@
         "UMask": "0x7e",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x78",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an icache or itlb miss which missed all the caches. Local DRAM, MMIO or other local memory type provides the data.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS_LOCALMEM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1088,7 +1106,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1100,7 +1118,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1112,7 +1130,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1124,7 +1142,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1136,7 +1154,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1148,7 +1166,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1160,7 +1178,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1172,7 +1190,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1274,7 +1292,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of store uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
index b21d602e9f1a..798eebf77436 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json
@@ -424,6 +424,15 @@
         "UMask": "0x1",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that missed in the L2 cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc9",
+        "EventName": "FRONTEND_RETIRED_SOURCE.ICACHE_L2_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xe",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that hit in the second level TLB.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -500,6 +509,24 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache In use-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache ISB-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
index 97797f7b072e..d98723b3cd78 100644
--- a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json
@@ -634,7 +634,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -725,7 +725,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -1530,8 +1530,9 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "BriefDescription": "This event is deprecated.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Deprecated": "1",
         "EventCode": "0xe1",
         "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
         "SampleAfterValue": "1000003",
@@ -1539,8 +1540,9 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "BriefDescription": "This event is deprecated.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Deprecated": "1",
         "EventCode": "0xe1",
         "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
         "SampleAfterValue": "1000003",
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 96580ffda7bf..a2dde3faad5e 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -22,7 +22,7 @@ GenuineIntel-6-3A,v24,ivybridge,core
 GenuineIntel-6-3E,v24,ivytown,core
 GenuineIntel-6-2D,v24,jaketown,core
 GenuineIntel-6-(57|85),v16,knightslanding,core
-GenuineIntel-6-BD,v1.19,lunarlake,core
+GenuineIntel-6-BD,v1.21,lunarlake,core
 GenuineIntel-6-(AA|AC|B5),v1.18,meteorlake,core
 GenuineIntel-6-1[AEF],v4,nehalemep,core
 GenuineIntel-6-2E,v4,nehalemex,core

From 698fd9606ee685295313b929e64e3efd2cdd924e Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:33 -0800
Subject: [PATCH 013/131] perf vendor events intel: Update meteorlake events
 from 1.18 to 1.20

The updated events were published in:
https://github.com/intel/perfmon/commit/2eebd8e2612a0655e82b88e1d2fab960315c025b
https://github.com/intel/perfmon/commit/81c4ce2c16f05b839d2c40e8cf183ed110357b73

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  2 +-
 .../pmu-events/arch/x86/meteorlake/cache.json | 67 ++++++++++++++++---
 .../arch/x86/meteorlake/frontend.json         | 18 +++++
 .../arch/x86/meteorlake/pipeline.json         | 46 +++++++++++--
 4 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index a2dde3faad5e..8d8fd8b08166 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -23,7 +23,7 @@ GenuineIntel-6-3E,v24,ivytown,core
 GenuineIntel-6-2D,v24,jaketown,core
 GenuineIntel-6-(57|85),v16,knightslanding,core
 GenuineIntel-6-BD,v1.21,lunarlake,core
-GenuineIntel-6-(AA|AC|B5),v1.18,meteorlake,core
+GenuineIntel-6-(AA|AC|B5),v1.20,meteorlake,core
 GenuineIntel-6-1[AEF],v4,nehalemep,core
 GenuineIntel-6-2E,v4,nehalemex,core
 GenuineIntel-6-CC,v1.02,pantherlake,core
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
index d3fc04b2ffbd..4c1220c19456 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json
@@ -513,6 +513,15 @@
         "UMask": "0x6",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an ICACHE or ITLB miss which hit in the LLC, no snoop was required. LLC provides the data. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_HIT_NOSNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an ICACHE or ITLB miss which missed all the caches. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -522,6 +531,15 @@
         "UMask": "0x78",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an ICACHE or ITLB miss which missed all the caches. DRAM, MMIO or other LOCAL memory type provides the data. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x35",
+        "EventName": "MEM_BOUND_STALLS_IFETCH.LLC_MISS_LOCALMEM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -559,6 +577,24 @@
         "UMask": "0x6",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC, no snoop was required. LLC provides the data. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_NOSNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC, a snoop was required, the snoop misses or the snoop hits but NO_FWD. LLC provides the data. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_SNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -568,6 +604,15 @@
         "UMask": "0x78",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss and the data was provided from an unknown source. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS_LOCALMEM",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled to a store buffer full condition",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -969,7 +1014,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -981,7 +1026,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -993,7 +1038,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1005,7 +1050,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1017,7 +1062,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1029,7 +1074,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1041,7 +1086,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1053,7 +1098,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1065,7 +1110,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1077,7 +1122,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -1159,7 +1204,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of store uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
index 6484834b1127..dcf8c8e720f3 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json
@@ -430,6 +430,24 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_INUSE",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_ISB",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
index bfdaabe9377d..7662846745bd 100644
--- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json
@@ -517,7 +517,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -583,7 +583,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "SampleAfterValue": "2000003",
@@ -620,7 +620,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -804,10 +804,10 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of instructions retired",
+        "BriefDescription": "Fixed Counter: Counts the number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
-        "PublicDescription": "Fixed Counter: Counts the number of instructions retired Available PDIST counters: 32",
+        "PublicDescription": "Fixed Counter: Counts the number of instructions retired. Available PDIST counters: 32",
         "SampleAfterValue": "2000003",
         "UMask": "0x1",
         "Unit": "cpu_atom"
@@ -1207,6 +1207,42 @@
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x11",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles stalled due to no store buffers available. (not including draining form sync).",
         "Counter": "0,1,2,3,4,5,6,7",

From 19967a42049166dbaa12fbe38d7c93a7148dd4ab Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:34 -0800
Subject: [PATCH 014/131] perf vendor events intel: Update pantherlake events
 from 1.02 to 1.04

The updated events were published in:
https://github.com/intel/perfmon/commit/1f46fa264d202d57dade1d3fd5b58e79c4706147
https://github.com/intel/perfmon/commit/e49581aeb2903dde6fb1d187e9d412df58e01038

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/pmu-events/arch/x86/mapfile.csv    |   2 +-
 .../arch/x86/pantherlake/cache.json           | 159 +++++++++++++-
 .../arch/x86/pantherlake/floating-point.json  |  28 +++
 .../arch/x86/pantherlake/frontend.json        |  36 ++++
 .../arch/x86/pantherlake/memory.json          |  27 +++
 .../arch/x86/pantherlake/other.json           |  10 +
 .../arch/x86/pantherlake/pipeline.json        | 200 +++++++++++++++++-
 .../arch/x86/pantherlake/virtual-memory.json  |  30 +++
 8 files changed, 485 insertions(+), 7 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 8d8fd8b08166..0839e21d4006 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -26,7 +26,7 @@ GenuineIntel-6-BD,v1.21,lunarlake,core
 GenuineIntel-6-(AA|AC|B5),v1.20,meteorlake,core
 GenuineIntel-6-1[AEF],v4,nehalemep,core
 GenuineIntel-6-2E,v4,nehalemex,core
-GenuineIntel-6-CC,v1.02,pantherlake,core
+GenuineIntel-6-CC,v1.04,pantherlake,core
 GenuineIntel-6-A7,v1.04,rocketlake,core
 GenuineIntel-6-2A,v19,sandybridge,core
 GenuineIntel-6-8F,v1.35,sapphirerapids,core
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/cache.json b/tools/perf/pmu-events/arch/x86/pantherlake/cache.json
index 91f5ab908926..e5323093eec0 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/cache.json
@@ -149,6 +149,60 @@
         "UMask": "0xff",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand Code Read requests. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_CODE_RD",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand Code Read requests that resulted in a Miss. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_CODE_RD_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x44",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand Data Read requests. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_DATA_RD",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand Data Read requests that resulted in a Miss. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_DATA_RD_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x41",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand RFO requests. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_RFO",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Demand RFO requests that resulted in a Miss. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.DEMAND_RFO_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x42",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of L2 cache accesses from front door requests that resulted in a Hit. Does not include rejects or recycles, per core event.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -158,6 +212,24 @@
         "UMask": "0x1bf",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door Hardware Prefetch requests. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.HWPF",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xc8",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of L2 cache accesses from front door requests that resulted in a Miss. Does not include rejects or recycles, per core event.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x24",
+        "EventName": "L2_REQUEST.MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x17f",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Read requests with true-miss in L2 cache [This event is alias to L2_RQSTS.MISS]",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -365,6 +437,24 @@
         "UMask": "0x6",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC, no snoop was required. LLC provided data.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_NOSNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC, a snoop was required, and the snoop misses or the snoop hits but does not forward data. LLC provides the data.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x34",
+        "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_SNOOP",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -716,6 +806,16 @@
         "UMask": "0x20",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the total number of load ops retired that miss the L3 cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd3",
+        "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.ALL",
+        "PublicDescription": "Counts the total number of load ops retired that miss the L3 cache. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -746,6 +846,26 @@
         "UMask": "0x8",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and no data was forwarded.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd4",
+        "EventName": "MEM_LOAD_UOPS_MISC_RETIRED.L3_HIT_SNOOP_NO_FWD",
+        "PublicDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and no data was forwarded. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and non-modified data was forwarded.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd4",
+        "EventName": "MEM_LOAD_UOPS_MISC_RETIRED.L3_HIT_SNOOP_WITH_FWD",
+        "PublicDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and non-modified data was forwarded. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -796,6 +916,26 @@
         "UMask": "0x1c",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache in which no snoop was required.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT_NO_SNOOP",
+        "PublicDescription": "Counts the number of load ops retired that hit in the L3 cache in which no snoop was required. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and it hit and forwarded data, it hit and did not forward data, or it hit and the forwarded data was modified.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xd1",
+        "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT_SNOOP_HIT",
+        "PublicDescription": "Counts the number of load ops retired that hit in the L3 cache in which a snoop was required and it hit and forwarded data, it hit and did not forward data, or it hit and the forwarded data was modified. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons:  load buffer, store buffer or RSV full.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -880,13 +1020,14 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x400",
-        "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1",
+        "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024. Available PDIST counters: 0,1",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
@@ -894,6 +1035,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
         "MSRIndex": "0x3F6",
@@ -906,6 +1048,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
         "MSRIndex": "0x3F6",
@@ -916,13 +1059,14 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048",
         "MSRIndex": "0x3F6",
         "MSRValue": "0x800",
-        "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1",
+        "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048. Available PDIST counters: 0,1",
         "SampleAfterValue": "1000003",
         "UMask": "0x5",
         "Unit": "cpu_atom"
@@ -930,6 +1074,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
         "MSRIndex": "0x3F6",
@@ -942,6 +1087,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
         "MSRIndex": "0x3F6",
@@ -954,6 +1100,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
         "MSRIndex": "0x3F6",
@@ -966,6 +1113,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
         "MSRIndex": "0x3F6",
@@ -978,6 +1126,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
         "MSRIndex": "0x3F6",
@@ -990,6 +1139,7 @@
     {
         "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
         "MSRIndex": "0x3F6",
@@ -1072,6 +1222,7 @@
     {
         "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
         "Counter": "0,1,2,3,4,5,6,7",
+        "Data_LA": "1",
         "EventCode": "0xd0",
         "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY",
         "PublicDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES Available PDIST counters: 0,1",
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json b/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json
index e306a45b22ee..77f6c9028d93 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json
@@ -1,4 +1,14 @@
 [
+    {
+        "BriefDescription": "Counts the number of cycles when any of the floating point dividers are active.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles when floating-point divide unit is busy executing divide or square root operations.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -10,6 +20,24 @@
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of floating point dividers per cycle in the loop stage.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_OCCUPANCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of floating point divider uops executed per cycle.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.FPDIV_UOPS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts all microcode FP assists.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json b/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json
index d36faa683d3f..5e69b81742f5 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json
@@ -422,6 +422,24 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache In use-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10",
+        "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache ISB-full",
+        "Counter": "0,1,2,3,4,5,6,7,8,9",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8",
+        "Unit": "cpu_core"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -561,5 +579,23 @@
         "SampleAfterValue": "1000003",
         "UMask": "0x1",
         "Unit": "cpu_core"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the micro-sequencer is busy.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe7",
+        "EventName": "MS_DECODED.MS_BUSY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of times entered into a ucode flow in the FEC. Includes inserted flows due to front-end detected faults or assists.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe7",
+        "EventName": "MS_DECODED.MS_ENTRY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
     }
 ]
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/memory.json b/tools/perf/pmu-events/arch/x86/pantherlake/memory.json
index 3d31e620383d..4248cc101391 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/memory.json
@@ -8,6 +8,24 @@
         "UMask": "0xf4",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a DL1 miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to a DL1 miss.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.L1_MISS_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x81",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to request buffers full or lock in progress.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -17,6 +35,15 @@
         "UMask": "0x2",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer and retirement are both stalled due to request buffers full or lock in progress.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x05",
+        "EventName": "LD_HEAD.WCB_FULL_AT_RET",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x82",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.",
         "Counter": "0,1,2,3,4,5,6,7",
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/other.json b/tools/perf/pmu-events/arch/x86/pantherlake/other.json
index d49651d4f112..915c52f5abd1 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/other.json
@@ -30,6 +30,16 @@
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the total number of BTCLEARS.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe8",
+        "EventName": "PREDICTION.BTCLEAR",
+        "PublicDescription": "Counts the total number of BTCLEARS which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles the uncore cannot take further requests",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json b/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json
index fb87d30c403d..86009237df2f 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json
@@ -1,4 +1,14 @@
 [
+    {
+        "BriefDescription": "Counts the number of cycles when any of the floating point or integer dividers are active.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.DIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -10,6 +20,16 @@
         "UMask": "0x9",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of cycles when any of the integer dividers are active.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "CounterMask": "1",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.IDIV_ACTIVE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Cycles when integer divide unit is busy executing divide or square root operations.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -21,6 +41,24 @@
         "UMask": "0x8",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts number of active integer dividers per cycle.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.IDIV_OCCUPANCY",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of integer divider uops executed per cycle.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xcd",
+        "EventName": "ARITH.IDIV_UOPS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x4",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Number of occurrences where a microcode assist is invoked by hardware.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -58,6 +96,38 @@
         "SampleAfterValue": "400009",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT]",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.ALL_NEAR_IND",
+        "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT] Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_OR_RETURN]",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "Deprecated": "1",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.ALL_NEAR_IND_OR_RET",
+        "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_OR_RETURN] Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x58",
+        "Unit": "cpu_atom"
+    },
+    {
+        "BriefDescription": "Counts the number of conditional branch instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND",
+        "PublicDescription": "Counts the number of conditional branch instructions retired. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x7",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Conditional branch instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -88,6 +158,16 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of taken conditional branch instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN",
+        "PublicDescription": "Counts the number of taken conditional branch instructions retired. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Taken conditional branch instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -98,6 +178,16 @@
         "UMask": "0x3",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of taken backward conditional branch instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN_BWD",
+        "PublicDescription": "Counts the number of taken backward conditional branch instructions retired. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Taken backward conditional branch instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -108,6 +198,16 @@
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of taken forward conditional branch instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.COND_TAKEN_FWD",
+        "PublicDescription": "Counts the number of taken forward conditional branch instructions retired. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Taken forward conditional branch instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -178,6 +278,16 @@
         "UMask": "0x80",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired. [This event is alias to BR_INST_RETIRED.ALL_NEAR_IND]",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_INDIRECT",
+        "PublicDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired. [This event is alias to BR_INST_RETIRED.ALL_NEAR_IND] Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x50",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Indirect near branch instructions retired (excluding returns) [This event is alias to BR_INST_RETIRED.INDIRECT]",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -208,6 +318,16 @@
         "UMask": "0x40",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of near indirect JMP, near indirect CALL, and RET branch instructions retired. [This event is alias to BR_INST_RETIRED.ALL_NEAR_IND_OR_RET]",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc4",
+        "EventName": "BR_INST_RETIRED.NEAR_INDIRECT_OR_RETURN",
+        "PublicDescription": "Counts the number of near indirect JMP, near indirect CALL, and RET branch instructions retired. [This event is alias to BR_INST_RETIRED.ALL_NEAR_IND_OR_RET] Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x58",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_CALL]",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -283,7 +403,7 @@
         "Unit": "cpu_atom"
     },
     {
-        "BriefDescription": "Taken branch instructions retired.",
+        "BriefDescription": "Near Taken branch instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
         "EventCode": "0xc4",
         "EventName": "BR_INST_RETIRED.NEAR_TAKEN",
@@ -755,7 +875,7 @@
         "Unit": "cpu_core"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -1549,6 +1669,16 @@
         "UMask": "0x1",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "PublicDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of LFENCE instructions retired.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1620,6 +1750,15 @@
         "UMask": "0x4",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of issue slots not consumed due to a color request for an FCW or MXCSR control register when all 4 colors (copies) are already in use.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x75",
+        "EventName": "SERIALIZATION.COLOR_STALLS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of issue slots where no uop could issue due to an IQ scoreboard that stalls allocation until a specified older uop retires or (in the case of jump scoreboard) executes. Commonly executed instructions with IQ scoreboards include LFENCE and MFENCE.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1732,6 +1871,15 @@
         "UMask": "0x8",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x73",
+        "EventName": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x3",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to Branch Mispredict",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -1795,6 +1943,15 @@
         "UMask": "0x2",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to ROB full",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x74",
+        "EventName": "TOPDOWN_BE_BOUND.REORDER_BUFFER",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x40",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of issue slots every cycle that were not consumed by the backend due to iq/jeu scoreboards or ms scb",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -2076,6 +2233,15 @@
         "UMask": "0x10",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of uops issued by the front end every cycle.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x0e",
+        "EventName": "UOPS_ISSUED.ANY",
+        "PublicDescription": "Counts the number of uops issued by the front end every cycle. When 4-uops are requested and only 2-uops are delivered, the event counts 2. Uops_issued correlates to the number of ROB entries. If uop takes 2 ROB slots it counts as 2 uops_issued.",
+        "SampleAfterValue": "1000003",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Uops that RAT issues to RS",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -2107,6 +2273,16 @@
         "UMask": "0x2",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of uops retired that are the last uop of a macro-instruction.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.EOM",
+        "PublicDescription": "Counts the number of uops retired that are the last uop of a macro-instruction.   EOM uops indicate the 'end of a macro-instruction' and play a crucial role in the processor's control flow and recovery mechanisms.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x1",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Retired uops except the last uop of each instruction.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -2127,6 +2303,16 @@
         "UMask": "0x80",
         "Unit": "cpu_atom"
     },
+    {
+        "BriefDescription": "Counts the number of uops retired that originated from a loop stream detector.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.LSD",
+        "PublicDescription": "Counts the number of uops retired that originated from a loop stream detector. Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x20",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS).  This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
         "Counter": "0,1,2,3,4,5,6,7",
@@ -2161,6 +2347,16 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "UOPS_RETIRED.NANO_CODE",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xc2",
+        "EventName": "UOPS_RETIRED.NANO_CODE",
+        "PublicDescription": "UOPS_RETIRED.NANO_CODE Available PDIST counters: 0,1",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x8",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance  for example, as measured by the instructions-per-cycle metric.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json
index 8d56c16b2a39..8f3dd36707dc 100644
--- a/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json
+++ b/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json
@@ -78,6 +78,16 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to a 4K page.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x08",
+        "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to loads (including SW prefetches) whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages. Includes page walks that page fault.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Page walks completed due to a demand data load to a 4K page.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -178,6 +188,16 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to a 4K page.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x49",
+        "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to stores whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Page walks completed due to a demand data store to a 4K page.",
         "Counter": "0,1,2,3,4,5,6,7,8,9",
@@ -267,6 +287,16 @@
         "UMask": "0x4",
         "Unit": "cpu_core"
     },
+    {
+        "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to a 4K page.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0x85",
+        "EventName": "ITLB_MISSES.WALK_COMPLETED_4K",
+        "PublicDescription": "Counts the number of page walks completed due to instruction fetches whose address translations missed in all Translation Lookaside Buffer (TLB) levels and were mapped to 4K pages.  Includes page walks that page fault.",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2",
+        "Unit": "cpu_atom"
+    },
     {
         "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)",
         "Counter": "0,1,2,3,4,5,6,7,8,9",

From c592a539172664afa1240ea324d6117dcb461ba3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:35 -0800
Subject: [PATCH 015/131] perf vendor events intel: Update sapphirerapids
 events from 1.35 to 1.36

The updated events were published in:
https://github.com/intel/perfmon/commit/bda7f1e1839e2f9ea1ac45da338e6fe5ca6fdbb0

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/pmu-events/arch/x86/mapfile.csv      |  2 +-
 .../arch/x86/sapphirerapids/cache.json          |  4 ++--
 .../arch/x86/sapphirerapids/frontend.json       | 16 ++++++++++++++++
 .../arch/x86/sapphirerapids/uncore-cache.json   |  4 ++--
 .../arch/x86/sapphirerapids/uncore-io.json      | 17 +++++++++--------
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 0839e21d4006..8ef03af9f150 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -29,7 +29,7 @@ GenuineIntel-6-2E,v4,nehalemex,core
 GenuineIntel-6-CC,v1.04,pantherlake,core
 GenuineIntel-6-A7,v1.04,rocketlake,core
 GenuineIntel-6-2A,v19,sandybridge,core
-GenuineIntel-6-8F,v1.35,sapphirerapids,core
+GenuineIntel-6-8F,v1.36,sapphirerapids,core
 GenuineIntel-6-AF,v1.13,sierraforest,core
 GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core
 GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v59,skylake,core
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json
index c66324d41a89..373b26c84448 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json
@@ -514,7 +514,7 @@
         "EventCode": "0xd3",
         "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM",
         "PublicDescription": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM Available PDIST counters: 0",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "100007",
         "UMask": "0x2"
     },
     {
@@ -534,7 +534,7 @@
         "EventCode": "0xd3",
         "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM",
         "PublicDescription": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM Available PDIST counters: 0",
-        "SampleAfterValue": "1000003",
+        "SampleAfterValue": "100007",
         "UMask": "0x4"
     },
     {
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json
index 793c486ffabe..e51f5e85ffd1 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json
@@ -271,6 +271,22 @@
         "SampleAfterValue": "200003",
         "UMask": "0x4"
     },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_INUSE",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_INUSE",
+        "SampleAfterValue": "200003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "ICACHE_TAG.STALLS_ISB",
+        "Counter": "0,1,2,3",
+        "EventCode": "0x83",
+        "EventName": "ICACHE_TAG.STALLS_ISB",
+        "SampleAfterValue": "200003",
+        "UMask": "0x8"
+    },
     {
         "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop",
         "Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json
index 1bdda3c3ccbf..59f6fd2c7a8f 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json
@@ -3501,7 +3501,7 @@
         "EventName": "UNC_CHA_SNOOP_RESP.RSPIFWD",
         "Experimental": "1",
         "PerPkg": "1",
-        "PublicDescription": "Counts when a a transaction with the opcode type RspIFwd Snoop Response was received which indicates a remote caching agent forwarded the data and the requesting agent is able to acquire the data in E (Exclusive) or M (modified) states.  This is commonly returned with RFO (the Read for Ownership issued before a write) transactions.  The snoop could have either been to a cacheline in the M,E,F (Modified, Exclusive or Forward)  states.",
+        "PublicDescription": "Counts when a transaction with the opcode type RspIFwd Snoop Response was received which indicates a remote caching agent forwarded the data and the requesting agent is able to acquire the data in E (Exclusive) or M (modified) states.  This is commonly returned with RFO (the Read for Ownership issued before a write) transactions.  The snoop could have either been to a cacheline in the M,E,F (Modified, Exclusive or Forward)  states.",
         "UMask": "0x4",
         "Unit": "CHA"
     },
@@ -3523,7 +3523,7 @@
         "EventName": "UNC_CHA_SNOOP_RESP.RSPSFWD",
         "Experimental": "1",
         "PerPkg": "1",
-        "PublicDescription": "Counts when a a transaction with the opcode type RspSFwd Snoop Response was received which indicates a remote caching agent forwarded the data but held on to its current copy.  This is common for data and code reads that hit in a remote socket in E (Exclusive) or F (Forward) state.",
+        "PublicDescription": "Counts when a transaction with the opcode type RspSFwd Snoop Response was received which indicates a remote caching agent forwarded the data but held on to its current copy.  This is common for data and code reads that hit in a remote socket in E (Exclusive) or F (Forward) state.",
         "UMask": "0x8",
         "Unit": "CHA"
     },
diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json
index dac7e6c50f31..45675a1099e2 100644
--- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json
+++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-io.json
@@ -303,6 +303,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
+        "PortMask": "0xff",
         "UMask": "0xff",
         "Unit": "IIO"
     },
@@ -314,7 +315,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x01",
         "PublicDescription": "x16 card plugged in to stack, Or x8 card plugged in to Lane 0/1, Or x4 card is plugged in to slot 0",
         "UMask": "0x1",
         "Unit": "IIO"
@@ -327,7 +328,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x02",
         "PublicDescription": "x4 card is plugged in to slot 1",
         "UMask": "0x2",
         "Unit": "IIO"
@@ -340,7 +341,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x04",
         "PublicDescription": "x8 card plugged in to Lane 2/3, Or x4 card is plugged in to slot 1",
         "UMask": "0x4",
         "Unit": "IIO"
@@ -353,7 +354,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x08",
         "PublicDescription": "x4 card is plugged in to slot 3",
         "UMask": "0x8",
         "Unit": "IIO"
@@ -366,7 +367,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x10",
         "PublicDescription": "x16 card plugged in to stack, Or x8 card plugged in to Lane 0/1, Or x4 card is plugged in to slot 0",
         "UMask": "0x10",
         "Unit": "IIO"
@@ -379,7 +380,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x20",
         "PublicDescription": "x4 card is plugged in to slot 1",
         "UMask": "0x20",
         "Unit": "IIO"
@@ -392,7 +393,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x40",
         "PublicDescription": "x8 card plugged in to Lane 2/3, Or x4 card is plugged in to slot 1",
         "UMask": "0x40",
         "Unit": "IIO"
@@ -405,7 +406,7 @@
         "Experimental": "1",
         "FCMask": "0x07",
         "PerPkg": "1",
-        "PortMask": "0x0000",
+        "PortMask": "0x80",
         "PublicDescription": "x4 card is plugged in to slot 3",
         "UMask": "0x80",
         "Unit": "IIO"

From 977000589d30f8d4f0777893711199350d474363 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 26 Feb 2026 09:59:36 -0800
Subject: [PATCH 016/131] perf vendor events intel: Update sierraforest events
 from 1.13 to 1.15

The updated events were published in:
https://github.com/intel/perfmon/commit/996bacad8f144e675b32f0096b9fe6813380695c
https://github.com/intel/perfmon/commit/93b6ef08ca9b01788458e8f5a0e7cbb716715b7c

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/pmu-events/arch/x86/mapfile.csv    |  2 +-
 .../arch/x86/sierraforest/cache.json          | 22 +++++-----
 .../arch/x86/sierraforest/pipeline.json       | 42 ++++++++++++++++---
 3 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 8ef03af9f150..8a9e1735e21e 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -30,7 +30,7 @@ GenuineIntel-6-CC,v1.04,pantherlake,core
 GenuineIntel-6-A7,v1.04,rocketlake,core
 GenuineIntel-6-2A,v19,sandybridge,core
 GenuineIntel-6-8F,v1.36,sapphirerapids,core
-GenuineIntel-6-AF,v1.13,sierraforest,core
+GenuineIntel-6-AF,v1.15,sierraforest,core
 GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core
 GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v59,skylake,core
 GenuineIntel-6-55-[01234],v1.37,skylakex,core
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
index de0e7661a52d..168f43557a0e 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json
@@ -326,7 +326,7 @@
         "UMask": "0x82"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 1024. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -337,7 +337,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 128. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -348,7 +348,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 16. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -359,7 +359,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 2048. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -370,7 +370,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 256. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -381,7 +381,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 32. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -392,7 +392,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 4. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -403,7 +403,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 512. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -414,7 +414,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 64. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -425,7 +425,7 @@
         "UMask": "0x5"
     },
     {
-        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.",
+        "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold of 8. Only counts with PEBS enabled.",
         "Counter": "0,1",
         "Data_LA": "1",
         "EventCode": "0xd0",
@@ -499,7 +499,7 @@
         "UMask": "0x12"
     },
     {
-        "BriefDescription": "Counts the number of  stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES",
+        "BriefDescription": "Counts the number of store uops retired.",
         "Counter": "0,1,2,3,4,5,6,7",
         "Data_LA": "1",
         "EventCode": "0xd0",
diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
index 70af13143024..cf67ff6135e0 100644
--- a/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/sierraforest/pipeline.json
@@ -186,7 +186,7 @@
         "UMask": "0xf7"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.THREAD]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.CORE",
         "SampleAfterValue": "2000003",
@@ -200,7 +200,7 @@
         "SampleAfterValue": "2000003"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.",
         "Counter": "Fixed counter 2",
         "EventName": "CPU_CLK_UNHALTED.REF_TSC",
         "SampleAfterValue": "2000003",
@@ -216,7 +216,7 @@
         "UMask": "0x1"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles",
+        "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles. [This event is alias to CPU_CLK_UNHALTED.CORE]",
         "Counter": "Fixed counter 1",
         "EventName": "CPU_CLK_UNHALTED.THREAD",
         "SampleAfterValue": "2000003",
@@ -230,10 +230,10 @@
         "SampleAfterValue": "2000003"
     },
     {
-        "BriefDescription": "Fixed Counter: Counts the number of instructions retired",
+        "BriefDescription": "Fixed Counter: Counts the number of instructions retired.",
         "Counter": "Fixed counter 0",
         "EventName": "INST_RETIRED.ANY",
-        "PublicDescription": "Fixed Counter: Counts the number of instructions retired Available PDIST counters: 32",
+        "PublicDescription": "Fixed Counter: Counts the number of instructions retired. Available PDIST counters: 32",
         "SampleAfterValue": "2000003",
         "UMask": "0x1"
     },
@@ -309,6 +309,38 @@
         "SampleAfterValue": "1000003",
         "UMask": "0x1"
     },
+    {
+        "BriefDescription": "Counts the number of CLFLUSH, CLWB, and CLDEMOTE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.CL_INST",
+        "SampleAfterValue": "1000003",
+        "UMask": "0xff"
+    },
+    {
+        "BriefDescription": "Counts the number of LFENCE instructions retired.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe0",
+        "EventName": "MISC_RETIRED1.LFENCE",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x2"
+    },
+    {
+        "BriefDescription": "Counts the number of accesses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x10"
+    },
+    {
+        "BriefDescription": "Counts the number of misses to KeyLocker cache.",
+        "Counter": "0,1,2,3,4,5,6,7",
+        "EventCode": "0xe1",
+        "EventName": "MISC_RETIRED2.KEYLOCKER_MISS",
+        "SampleAfterValue": "1000003",
+        "UMask": "0x11"
+    },
     {
         "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state.",
         "Counter": "0,1,2,3,4,5,6,7",

From 06ec44c2aa2ef15fd56f9808b6cf7495e1fbd8ec Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Sun, 1 Mar 2026 17:43:25 +0000
Subject: [PATCH 017/131] perf kvm stat: Fix relative paths for including
 headers

Add an extra "../" to the relative paths so that the uAPI headers
provided by tools can be found correctly.

Fixes: a724a8fce5e2 ("perf kvm stat: Fix build error")
Reported-by: Namhyung Kim <namhyung@kernel.org>
Suggested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/kvm-stat-arch/kvm-stat-x86.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
index 43275d25b6cb..0f626db3a439 100644
--- a/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
+++ b/tools/perf/util/kvm-stat-arch/kvm-stat-x86.c
@@ -4,9 +4,9 @@
 #include "../kvm-stat.h"
 #include "../evsel.h"
 #include "../env.h"
-#include "../../arch/x86/include/uapi/asm/svm.h"
-#include "../../arch/x86/include/uapi/asm/vmx.h"
-#include "../../arch/x86/include/uapi/asm/kvm.h"
+#include "../../../arch/x86/include/uapi/asm/svm.h"
+#include "../../../arch/x86/include/uapi/asm/vmx.h"
+#include "../../../arch/x86/include/uapi/asm/kvm.h"
 #include <subcmd/parse-options.h>
 
 define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS);

From d05073adda0f047e9b2115a2932bcb2797eab238 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 2 Mar 2026 15:45:15 -0800
Subject: [PATCH 018/131] perf trace: Avoid an ERR_PTR in syscall_stats

hashmap__new may return an ERR_PTR and previously this would be
assigned to syscall_stats meaning all use of syscall_stats needs to
test for NULL (uninitialized) or an ERR_PTR. Given the only reason
hashmap__new can fail is ENOMEM, just use NULL to indicate the
allocation failure and avoid the code having to test for NULL and
IS_ERR.

Fixes: 96f202eab813 ("perf trace: Fix IS_ERR() vs NULL check bug")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 295b272c6c29..7ff85fa90d98 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1565,7 +1565,9 @@ static bool syscall_id_equal(long key1, long key2, void *ctx __maybe_unused)
 
 static struct hashmap *alloc_syscall_stats(void)
 {
-	return hashmap__new(syscall_id_hash, syscall_id_equal, NULL);
+	struct hashmap *result = hashmap__new(syscall_id_hash, syscall_id_equal, NULL);
+
+	return IS_ERR(result) ? NULL : result;
 }
 
 static void delete_syscall_stats(struct hashmap *syscall_stats)
@@ -1573,7 +1575,7 @@ static void delete_syscall_stats(struct hashmap *syscall_stats)
 	struct hashmap_entry *pos;
 	size_t bkt;
 
-	if (IS_ERR(syscall_stats))
+	if (!syscall_stats)
 		return;
 
 	hashmap__for_each_entry(syscall_stats, pos, bkt)
@@ -1589,7 +1591,7 @@ static struct thread_trace *thread_trace__new(struct trace *trace)
 		ttrace->files.max = -1;
 		if (trace->summary) {
 			ttrace->syscall_stats = alloc_syscall_stats();
-			if (IS_ERR(ttrace->syscall_stats))
+			if (!ttrace->syscall_stats)
 				zfree(&ttrace);
 		}
 	}
@@ -4464,7 +4466,7 @@ create_maps:
 
 	if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) {
 		trace->syscall_stats = alloc_syscall_stats();
-		if (IS_ERR(trace->syscall_stats))
+		if (!trace->syscall_stats)
 			goto out_delete_evlist;
 	}
 
@@ -4771,7 +4773,7 @@ static int trace__replay(struct trace *trace)
 
 	if (trace->summary_mode == SUMMARY__BY_TOTAL) {
 		trace->syscall_stats = alloc_syscall_stats();
-		if (IS_ERR(trace->syscall_stats))
+		if (!trace->syscall_stats)
 			goto out;
 	}
 

From 895306e3c881ae8a3227a31bf4e64865ad6a534f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 3 Mar 2026 10:52:16 -0800
Subject: [PATCH 019/131] perf pmu: Replace starts_with with strstarts

linux/string.h provides strstarts that matches the starts_with
function. For style and consistency reasons remove the starts_with
functions and use strstarts.

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/arch/x86/util/pmu.c | 12 ++++--------
 tools/perf/util/drm_pmu.c      | 36 +++++++++++++++-------------------
 2 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index a3f96221758d..4ea4d022c9c3 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -5,6 +5,7 @@
 #include <dirent.h>
 #include <fcntl.h>
 #include <linux/stddef.h>
+#include <linux/string.h>
 #include <linux/perf_event.h>
 #include <linux/zalloc.h>
 #include <api/fs/fs.h>
@@ -71,11 +72,6 @@ static int snc_nodes_per_l3_cache(void)
 	return snc_nodes;
 }
 
-static bool starts_with(const char *str, const char *prefix)
-{
-	return !strncmp(prefix, str, strlen(prefix));
-}
-
 static int num_chas(void)
 {
 	static bool checked_chas;
@@ -93,7 +89,7 @@ static int num_chas(void)
 
 		while ((dent = io_dir__readdir(&dir)) != NULL) {
 			/* Note, dent->d_type will be DT_LNK and so isn't a useful filter. */
-			if (starts_with(dent->d_name, "uncore_cha_"))
+			if (strstarts(dent->d_name, "uncore_cha_"))
 				num_chas++;
 		}
 		close(fd);
@@ -305,9 +301,9 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
 			else
 				pmu->mem_events = perf_mem_events_intel;
 		} else if (x86__is_intel_graniterapids()) {
-			if (starts_with(pmu->name, "uncore_cha_"))
+			if (strstarts(pmu->name, "uncore_cha_"))
 				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/true);
-			else if (starts_with(pmu->name, "uncore_imc_"))
+			else if (strstarts(pmu->name, "uncore_imc_"))
 				gnr_uncore_cha_imc_adjust_cpumask_for_snc(pmu, /*cha=*/false);
 		}
 	}
diff --git a/tools/perf/util/drm_pmu.c b/tools/perf/util/drm_pmu.c
index b48a375e4584..b8badae7015c 100644
--- a/tools/perf/util/drm_pmu.c
+++ b/tools/perf/util/drm_pmu.c
@@ -15,6 +15,7 @@
 #include <unistd.h>
 #include <linux/unistd.h>
 #include <linux/kcmp.h>
+#include <linux/string.h>
 #include <linux/zalloc.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
@@ -129,11 +130,6 @@ static struct drm_pmu *add_drm_pmu(struct list_head *pmus, char *line, size_t li
 }
 
 
-static bool starts_with(const char *str, const char *prefix)
-{
-	return !strncmp(prefix, str, strlen(prefix));
-}
-
 static int add_event(struct drm_pmu_event **events, int *num_events,
 		     const char *line, enum drm_pmu_unit unit, const char *desc)
 {
@@ -174,7 +170,7 @@ static int read_drm_pmus_cb(void *args, int fdinfo_dir_fd, const char *fd_name)
 	}
 
 	while (io__getline(&io, &line, &line_len) > 0) {
-		if (starts_with(line, "drm-driver:")) {
+		if (strstarts(line, "drm-driver:")) {
 			drm = add_drm_pmu(pmus, line, line_len);
 			if (!drm)
 				break;
@@ -184,59 +180,59 @@ static int read_drm_pmus_cb(void *args, int fdinfo_dir_fd, const char *fd_name)
 		 * Note the string matching below is alphabetical, with more
 		 * specific matches appearing before less specific.
 		 */
-		if (starts_with(line, "drm-active-")) {
+		if (strstarts(line, "drm-active-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Total memory active in one or more engines");
 			continue;
 		}
-		if (starts_with(line, "drm-cycles-")) {
+		if (strstarts(line, "drm-cycles-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_CYCLES,
 				"Busy cycles");
 			continue;
 		}
-		if (starts_with(line, "drm-engine-capacity-")) {
+		if (strstarts(line, "drm-engine-capacity-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_CAPACITY,
 				"Engine capacity");
 			continue;
 		}
-		if (starts_with(line, "drm-engine-")) {
+		if (strstarts(line, "drm-engine-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_NS,
 				  "Utilization in ns");
 			continue;
 		}
-		if (starts_with(line, "drm-maxfreq-")) {
+		if (strstarts(line, "drm-maxfreq-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_HZ,
 				  "Maximum frequency");
 			continue;
 		}
-		if (starts_with(line, "drm-purgeable-")) {
+		if (strstarts(line, "drm-purgeable-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Size of resident and purgeable memory buffers");
 			continue;
 		}
-		if (starts_with(line, "drm-resident-")) {
+		if (strstarts(line, "drm-resident-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Size of resident memory buffers");
 			continue;
 		}
-		if (starts_with(line, "drm-shared-")) {
+		if (strstarts(line, "drm-shared-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Size of shared memory buffers");
 			continue;
 		}
-		if (starts_with(line, "drm-total-cycles-")) {
+		if (strstarts(line, "drm-total-cycles-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Total busy cycles");
 			continue;
 		}
-		if (starts_with(line, "drm-total-")) {
+		if (strstarts(line, "drm-total-")) {
 			add_event(&events, &num_events, line, DRM_PMU_UNIT_BYTES,
 				  "Size of shared and private memory");
 			continue;
 		}
-		if (verbose > 1 && starts_with(line, "drm-") &&
-		    !starts_with(line, "drm-client-id:") &&
-		    !starts_with(line, "drm-pdev:"))
+		if (verbose > 1 && strstarts(line, "drm-") &&
+		    !strstarts(line, "drm-client-id:") &&
+		    !strstarts(line, "drm-pdev:"))
 			pr_debug("Unhandled DRM PMU fdinfo line match '%s'\n", line);
 	}
 	if (drm) {
@@ -261,7 +257,7 @@ bool drm_pmu__have_event(const struct perf_pmu *pmu, const char *name)
 {
 	struct drm_pmu *drm = container_of(pmu, struct drm_pmu, pmu);
 
-	if (!starts_with(name, "drm-"))
+	if (!strstarts(name, "drm-"))
 		return false;
 
 	for (int i = 0; i < drm->num_events; i++) {

From 6910944bf0b92fea63d5a7aeed69e4b9c14fd01b Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Mon, 2 Mar 2026 15:58:21 -0800
Subject: [PATCH 020/131] perf test type profiling: Remove typedef on struct

The typedef creates an issue where the struct or the typedef may
appear in the output and cause the "perf data type profiling tests" to
fail. Let's remove the typedef to keep the test passing.

Fixes: 335047109d7d ("perf tests: Test annotate with data type profiling and C")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/data_type_profiling.sh | 2 +-
 tools/perf/tests/workloads/datasym.c          | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
index 2a7f8f7c42d0..fb47b7213b33 100755
--- a/tools/perf/tests/shell/data_type_profiling.sh
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -8,7 +8,7 @@ set -e
 # data type profiling manifestation
 
 # Values in testtypes and testprogs should match
-testtypes=("# data-type: struct Buf" "# data-type: struct _buf")
+testtypes=("# data-type: struct Buf" "# data-type: struct buf")
 testprogs=("perf test -w code_with_type" "perf test -w datasym")
 
 err=0
diff --git a/tools/perf/tests/workloads/datasym.c b/tools/perf/tests/workloads/datasym.c
index 1d0b7d64e1ba..19242c7255c0 100644
--- a/tools/perf/tests/workloads/datasym.c
+++ b/tools/perf/tests/workloads/datasym.c
@@ -4,14 +4,14 @@
 #include <linux/compiler.h>
 #include "../tests.h"
 
-typedef struct _buf {
+struct buf {
 	char data1;
 	char reserved[55];
 	char data2;
-} buf __attribute__((aligned(64)));
+} __attribute__((aligned(64)));
 
 /* volatile to try to avoid the compiler seeing reserved as unused. */
-static volatile buf workload_datasym_buf1 = {
+static volatile struct buf workload_datasym_buf1 = {
 	/* to have this in the data section */
 	.reserved[0] = 1,
 };

From b1718b0367ba31e8db273e3896ebd1707bcbe59e Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <pcc@google.com>
Date: Tue, 3 Mar 2026 15:00:54 -0800
Subject: [PATCH 021/131] perf annotate: Specify llvm features="+all" for
 aarch64

This is consistent with what llvm-objdump does (see [1]) and allows
the LLVM disassembler to disassemble instructions not in the base
instruction set.

[1] https://reviews.llvm.org/D127741

Link: https://linux-review.googlesource.com/id/I52e4fef18d2e12b45f875231fa9d3efff2538fd4
Signed-off-by: Peter Collingbourne <pcc@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/llvm.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c
index 0d126d233c01..a0deb742a733 100644
--- a/tools/perf/util/llvm.c
+++ b/tools/perf/util/llvm.c
@@ -153,11 +153,17 @@ int symbol__disassemble_llvm(const char *filename, struct symbol *sym,
 					  /*get_op_info=*/NULL, symbol_lookup_callback);
 	} else {
 		char triplet[64];
+		const char *features = NULL;
 
 		scnprintf(triplet, sizeof(triplet), "%s-linux-gnu",
 			  args->arch->name);
-		disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0,
-					  /*get_op_info=*/NULL, symbol_lookup_callback);
+		if (args->arch->id.e_machine == EM_AARCH64)
+			features = "+all";
+		disasm = LLVMCreateDisasmCPUFeatures(triplet, /*cpu=*/"",
+						     features, &storage,
+						     /*tag_type=*/0,
+						     /*get_op_info=*/NULL,
+						     symbol_lookup_callback);
 	}
 
 	if (disasm == NULL)

From 86ff690f45cc034ab32246630b3c7d7a46d1ae6b Mon Sep 17 00:00:00 2001
From: Besar Wicaksono <bwicaksono@nvidia.com>
Date: Thu, 12 Feb 2026 23:34:07 +0000
Subject: [PATCH 022/131] perf vendor events arm64: Add Tegra410 Olympus PMU
 events

Add JSON files for NVIDIA Tegra410 Olympus core PMU events.
Also updated the common-and-microarch.json.

Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../arch/arm64/common-and-microarch.json      |  85 +++
 tools/perf/pmu-events/arch/arm64/mapfile.csv  |   1 +
 .../arch/arm64/nvidia/t410/branch.json        |  45 ++
 .../arch/arm64/nvidia/t410/brbe.json          |   6 +
 .../arch/arm64/nvidia/t410/bus.json           |  48 ++
 .../arch/arm64/nvidia/t410/exception.json     |  62 ++
 .../arch/arm64/nvidia/t410/fp_operation.json  |  78 ++
 .../arch/arm64/nvidia/t410/general.json       |  15 +
 .../arch/arm64/nvidia/t410/l1d_cache.json     | 122 +++
 .../arch/arm64/nvidia/t410/l1i_cache.json     | 114 +++
 .../arch/arm64/nvidia/t410/l2d_cache.json     | 134 ++++
 .../arch/arm64/nvidia/t410/ll_cache.json      | 107 +++
 .../arch/arm64/nvidia/t410/memory.json        |  46 ++
 .../arch/arm64/nvidia/t410/metrics.json       | 722 ++++++++++++++++++
 .../arch/arm64/nvidia/t410/misc.json          | 642 ++++++++++++++++
 .../arch/arm64/nvidia/t410/retired.json       |  94 +++
 .../arch/arm64/nvidia/t410/spe.json           |  42 +
 .../arm64/nvidia/t410/spec_operation.json     | 230 ++++++
 .../arch/arm64/nvidia/t410/stall.json         | 145 ++++
 .../arch/arm64/nvidia/t410/tlb.json           | 158 ++++
 20 files changed, 2896 insertions(+)
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json
 create mode 100644 tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json

diff --git a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
index 468cb085d879..144325d87be4 100644
--- a/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
+++ b/tools/perf/pmu-events/arch/arm64/common-and-microarch.json
@@ -1512,11 +1512,26 @@
         "EventName": "L2D_CACHE_REFILL_PRFM",
         "BriefDescription": "Level 2 data cache refill, software preload"
     },
+    {
+        "EventCode": "0x8150",
+        "EventName": "L3D_CACHE_RW",
+        "BriefDescription": "Level 3 data cache demand access."
+    },
+    {
+        "EventCode": "0x8151",
+        "EventName": "L3D_CACHE_PRFM",
+        "BriefDescription": "Level 3 data cache software prefetch"
+    },
     {
         "EventCode": "0x8152",
         "EventName": "L3D_CACHE_MISS",
         "BriefDescription": "Level 3 data cache demand access miss"
     },
+    {
+        "EventCode": "0x8153",
+        "EventName": "L3D_CACHE_REFILL_PRFM",
+        "BriefDescription": "Level 3 data cache refill, software prefetch."
+    },
     {
         "EventCode": "0x8154",
         "EventName": "L1D_CACHE_HWPRF",
@@ -1527,6 +1542,11 @@
         "EventName": "L2D_CACHE_HWPRF",
         "BriefDescription": "Level 2 data cache hardware prefetch."
     },
+    {
+        "EventCode": "0x8156",
+        "EventName": "L3D_CACHE_HWPRF",
+        "BriefDescription": "Level 3 data cache hardware prefetch."
+    },
     {
         "EventCode": "0x8158",
         "EventName": "STALL_FRONTEND_MEMBOUND",
@@ -1682,6 +1702,11 @@
         "EventName": "L2D_CACHE_REFILL_HWPRF",
         "BriefDescription": "Level 2 data cache refill, hardware prefetch."
     },
+    {
+        "EventCode": "0x81BE",
+        "EventName": "L3D_CACHE_REFILL_HWPRF",
+        "BriefDescription": "Level 3 data cache refill, hardware prefetch."
+    },
     {
         "EventCode": "0x81C0",
         "EventName": "L1I_CACHE_HIT_RD",
@@ -1712,11 +1737,31 @@
         "EventName": "L1I_CACHE_HIT_RD_FPRFM",
         "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by software preload"
     },
+    {
+        "EventCode": "0x81DC",
+        "EventName": "L1D_CACHE_HIT_RW_FPRFM",
+        "BriefDescription": "Level 1 data cache demand access first hit, fetched by software prefetch."
+    },
     {
         "EventCode": "0x81E0",
         "EventName": "L1I_CACHE_HIT_RD_FHWPRF",
         "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by hardware prefetcher"
     },
+    {
+        "EventCode": "0x81EC",
+        "EventName": "L1D_CACHE_HIT_RW_FHWPRF",
+        "BriefDescription": "Level 1 data cache demand access first hit, fetched by hardware prefetcher."
+    },
+    {
+        "EventCode": "0x81F0",
+        "EventName": "L1I_CACHE_HIT_RD_FPRF",
+        "BriefDescription": "Level 1 instruction cache demand fetch first hit, fetched by prefetch."
+    },
+    {
+        "EventCode": "0x81FC",
+        "EventName": "L1D_CACHE_HIT_RW_FPRF",
+        "BriefDescription": "Level 1 data cache demand access first hit, fetched by prefetch."
+    },
     {
         "EventCode": "0x8200",
         "EventName": "L1I_CACHE_HIT",
@@ -1767,11 +1812,26 @@
         "EventName": "L1I_LFB_HIT_RD_FPRFM",
         "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by software preload"
     },
+    {
+        "EventCode": "0x825C",
+        "EventName": "L1D_LFB_HIT_RW_FPRFM",
+        "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by software prefetch."
+    },
     {
         "EventCode": "0x8260",
         "EventName": "L1I_LFB_HIT_RD_FHWPRF",
         "BriefDescription": "Level 1 instruction cache demand fetch line-fill buffer first hit, recently fetched by hardware prefetcher"
     },
+    {
+        "EventCode": "0x826C",
+        "EventName": "L1D_LFB_HIT_RW_FHWPRF",
+        "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by hardware prefetcher."
+    },
+    {
+        "EventCode": "0x827C",
+        "EventName": "L1D_LFB_HIT_RW_FPRF",
+        "BriefDescription": "Level 1 data cache demand access line-fill buffer first hit, recently fetched by prefetch."
+    },
     {
         "EventCode": "0x8280",
         "EventName": "L1I_CACHE_PRF",
@@ -1807,6 +1867,11 @@
         "EventName": "LL_CACHE_REFILL",
         "BriefDescription": "Last level cache refill"
     },
+    {
+        "EventCode": "0x828E",
+        "EventName": "L3D_CACHE_REFILL_PRF",
+        "BriefDescription": "Level 3 data cache refill, prefetch."
+    },
     {
         "EventCode": "0x8320",
         "EventName": "L1D_CACHE_REFILL_PERCYC",
@@ -1872,6 +1937,16 @@
         "EventName": "FP_FP8_MIN_SPEC",
         "BriefDescription": "Floating-point operation speculatively_executed, smallest type is 8-bit floating-point."
     },
+    {
+        "EventCode": "0x8480",
+        "EventName": "FP_SP_FIXED_MIN_OPS_SPEC",
+        "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point."
+    },
+    {
+        "EventCode": "0x8482",
+        "EventName": "FP_HP_FIXED_MIN_OPS_SPEC",
+        "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point."
+    },
     {
         "EventCode": "0x8483",
         "EventName": "FP_BF16_FIXED_MIN_OPS_SPEC",
@@ -1882,6 +1957,16 @@
         "EventName": "FP_FP8_FIXED_MIN_OPS_SPEC",
         "BriefDescription": "Non-scalable element arithmetic operations speculatively executed, smallest type is 8-bit floating-point."
     },
+    {
+        "EventCode": "0x8488",
+        "EventName": "FP_SP_SCALE_MIN_OPS_SPEC",
+        "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is single-precision floating-point."
+    },
+    {
+        "EventCode": "0x848A",
+        "EventName": "FP_HP_SCALE_MIN_OPS_SPEC",
+        "BriefDescription": "Scalable element arithmetic operations speculatively executed, smallest type is half-precision floating-point."
+    },
     {
         "EventCode": "0x848B",
         "EventName": "FP_BF16_SCALE_MIN_OPS_SPEC",
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index bb3fa8a33496..7f0eaa702048 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -46,3 +46,4 @@
 0x00000000500f0000,v1,ampere/emag,core
 0x00000000c00fac30,v1,ampere/ampereone,core
 0x00000000c00fac40,v1,ampere/ampereonex,core
+0x000000004e0f0100,v1,nvidia/t410,core
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json
new file mode 100644
index 000000000000..ef4effc00ec3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/branch.json
@@ -0,0 +1,45 @@
+[
+    {
+        "ArchStdEvent": "BR_MIS_PRED",
+        "PublicDescription": "This event counts branches which are speculatively executed and mispredicted."
+    },
+    {
+        "ArchStdEvent": "BR_PRED",
+        "PublicDescription": "This event counts all speculatively executed branches."
+    },
+    {
+        "EventCode": "0x017e",
+        "EventName": "BR_PRED_BTB_CTX_UPDATE",
+        "PublicDescription": "Branch context table update."
+    },
+    {
+        "EventCode": "0x0188",
+        "EventName": "BR_MIS_PRED_DIR_RESOLVED",
+        "PublicDescription": "Number of branch mispredictions due to direction misprediction."
+    },
+    {
+        "EventCode": "0x0189",
+        "EventName": "BR_MIS_PRED_DIR_UNCOND_RESOLVED",
+        "PublicDescription": "Number of branch mispredictions due to direction misprediction for unconditional branches."
+    },
+    {
+        "EventCode": "0x018a",
+        "EventName": "BR_MIS_PRED_DIR_UNCOND_DIRECT_RESOLVED",
+        "PublicDescription": "Number of branch mispredictions due to direction misprediction for unconditional direct branches."
+    },
+    {
+        "EventCode": "0x018b",
+        "EventName": "BR_PRED_MULTI_RESOLVED",
+        "PublicDescription": "Number of resolved branches whose prediction was made by the polymorphic indirect predictor."
+    },
+    {
+        "EventCode": "0x018c",
+        "EventName": "BR_MIS_PRED_MULTI_RESOLVED",
+        "PublicDescription": "Number of branch mispredictions whose prediction was made by the polymorphic indirect predictor."
+    },
+    {
+        "EventCode": "0x01e4",
+        "EventName": "BR_RGN_RECLAIM",
+        "PublicDescription": "This event counts the Indirect predictor entries flushed by region reclamation."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json
new file mode 100644
index 000000000000..9c315b2d7046
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/brbe.json
@@ -0,0 +1,6 @@
+[
+    {
+        "ArchStdEvent": "BRB_FILTRATE",
+        "PublicDescription": "This event counts each valid branch record captured in the branch record buffer. Branch records that are not captured because they are removed by filtering are not counted."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json
new file mode 100644
index 000000000000..5bb8de617c68
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/bus.json
@@ -0,0 +1,48 @@
+[
+    {
+        "ArchStdEvent": "BUS_ACCESS",
+        "PublicDescription": "This event counts the number of data-beat accesses between the CPU and the external bus. This count includes accesses due to read, write, and snoop. Each beat of data is counted individually."
+    },
+    {
+        "ArchStdEvent": "BUS_CYCLES",
+        "PublicDescription": "This event counts bus cycles in the CPU. Bus cycles represent a clock cycle in which a transaction could be sent or received on the interface from the CPU to the external bus. Since that interface is driven at the same clock speed as the CPU, this event increments at the rate of CPU clock. Regardless of the WFE/WFI state of the PE, this event increments on each processor clock."
+    },
+    {
+        "ArchStdEvent": "BUS_ACCESS_RD",
+        "PublicDescription": "This event counts memory Read transactions seen on the external bus. Each beat of data is counted individually."
+    },
+    {
+        "ArchStdEvent": "BUS_ACCESS_WR",
+        "PublicDescription": "This event counts memory Write transactions seen on the external bus. Each beat of data is counted individually."
+    },
+    {
+        "EventCode": "0x0154",
+        "EventName": "BUS_REQUEST_REQ",
+        "PublicDescription": "Bus request, request."
+    },
+    {
+        "EventCode": "0x0155",
+        "EventName": "BUS_REQUEST_RETRY",
+        "PublicDescription": "Bus request, retry."
+    },
+    {
+        "EventCode": "0x0198",
+        "EventName": "L2_CHI_CBUSY0",
+        "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 0."
+    },
+    {
+        "EventCode": "0x0199",
+        "EventName": "L2_CHI_CBUSY1",
+        "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 1."
+    },
+    {
+        "EventCode": "0x019a",
+        "EventName": "L2_CHI_CBUSY2",
+        "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 2."
+    },
+    {
+        "EventCode": "0x019b",
+        "EventName": "L2_CHI_CBUSY3",
+        "PublicDescription": "Number of RXDAT or RXRSP responses received with CBusy of 3."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json
new file mode 100644
index 000000000000..ecd996c3610b
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/exception.json
@@ -0,0 +1,62 @@
+[
+    {
+        "ArchStdEvent": "EXC_TAKEN",
+        "PublicDescription": "This event counts any taken architecturally visible exceptions such as IRQ, FIQ, SError, and other synchronous exceptions. Exceptions are counted whether or not they are taken locally."
+    },
+    {
+        "ArchStdEvent": "EXC_RETURN",
+        "PublicDescription": "This event counts any architecturally executed exception return instructions. For example: AArch64: ERET."
+    },
+    {
+        "ArchStdEvent": "EXC_UNDEF",
+        "PublicDescription": "This event counts the number of synchronous exceptions which are taken locally that are due to attempting to execute an instruction that is UNDEFINED.\nAttempting to execute instruction bit patterns that have not been allocated.\nAttempting to execute instructions when they are disabled.\nAttempting to execute instructions at an inappropriate Exception level.\nAttempting to execute an instruction when the value of PSTATE.IL is 1."
+    },
+    {
+        "ArchStdEvent": "EXC_SVC",
+        "PublicDescription": "This event counts SVC exceptions taken locally."
+    },
+    {
+        "ArchStdEvent": "EXC_PABORT",
+        "PublicDescription": "This event counts synchronous exceptions that are taken locally and caused by Instruction Aborts."
+    },
+    {
+        "ArchStdEvent": "EXC_DABORT",
+        "PublicDescription": "This event counts exceptions that are taken locally and are caused by data aborts or SErrors. Conditions that could cause those exceptions are attempting to read or write memory where the MMU generates a fault, attempting to read or write memory with a misaligned address, Interrupts from the nSEI inputs and internally generated SErrors."
+    },
+    {
+        "ArchStdEvent": "EXC_IRQ",
+        "PublicDescription": "This event counts IRQ exceptions including the virtual IRQs that are taken locally."
+    },
+    {
+        "ArchStdEvent": "EXC_FIQ",
+        "PublicDescription": "This event counts FIQ exceptions including the virtual FIQs that are taken locally."
+    },
+    {
+        "ArchStdEvent": "EXC_SMC",
+        "PublicDescription": "This event counts SMC exceptions taken to EL3."
+    },
+    {
+        "ArchStdEvent": "EXC_HVC",
+        "PublicDescription": "This event counts HVC exceptions taken to EL2."
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_PABORT",
+        "PublicDescription": "This event counts exceptions which are traps not taken locally and are caused by Instruction Aborts. For example, attempting to execute an instruction with a misaligned PC."
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_DABORT",
+        "PublicDescription": "This event counts exceptions which are traps not taken locally and are caused by Data Aborts or SError Interrupts. Conditions that could cause those exceptions are:\n* Attempting to read or write memory where the MMU generates a fault,\n* Attempting to read or write memory with a misaligned address,\n* Interrupts from the SEI input,\n* Internally generated SErrors."
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_OTHER",
+        "PublicDescription": "This event counts the number of synchronous trap exceptions which are not taken locally and are not SVC, SMC, HVC, Data Aborts, Instruction Aborts, or Interrupts."
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_IRQ",
+        "PublicDescription": "This event counts IRQ exceptions including the virtual IRQs that are not taken locally."
+    },
+    {
+        "ArchStdEvent": "EXC_TRAP_FIQ",
+        "PublicDescription": "This event counts FIQs which are not taken locally but taken from EL0, EL1, or EL2 to EL3 (which would be the normal behavior for FIQs when not executing in EL3)."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json
new file mode 100644
index 000000000000..3588e130781d
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/fp_operation.json
@@ -0,0 +1,78 @@
+[
+    {
+        "ArchStdEvent": "FP_HP_SPEC",
+        "PublicDescription": "This event counts speculatively executed half precision floating point operations."
+    },
+    {
+        "ArchStdEvent": "FP_SP_SPEC",
+        "PublicDescription": "This event counts speculatively executed single precision floating point operations."
+    },
+    {
+        "ArchStdEvent": "FP_DP_SPEC",
+        "PublicDescription": "This event counts speculatively executed double precision floating point operations."
+    },
+    {
+        "ArchStdEvent": "FP_SCALE_OPS_SPEC",
+        "PublicDescription": "This event counts speculatively executed scalable single precision floating point operations."
+    },
+    {
+        "ArchStdEvent": "FP_FIXED_OPS_SPEC",
+        "PublicDescription": "This event counts speculatively executed non-scalable single precision floating point operations."
+    },
+    {
+        "ArchStdEvent": "FP_HP_SCALE_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the counter to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_HP_FIXED_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_SP_SCALE_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_SP_FIXED_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_DP_SCALE_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the largest type was double-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_DP_FIXED_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the largest type was double-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_SP_FIXED_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was single-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_HP_FIXED_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was half-precision floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_BF16_FIXED_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was BFloat16 floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment. This event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_FP8_FIXED_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed non-scalable element arithmetic operation, due to an instruction where the smallest type was 8-bit floating-point, where v is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_SCALE_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_SP_SCALE_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was single-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_HP_SCALE_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was half-precision floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_BF16_SCALE_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was BFloat16 floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    },
+    {
+        "ArchStdEvent": "FP_FP8_SCALE_MIN_OPS_SPEC",
+        "PublicDescription": "This event increments by v for each speculatively executed scalable element arithmetic operation, due to an instruction where the smallest type was 8-bit floating-point, where v is a value such that (v*(VL/128)) is the number of arithmetic operations carried out by the operation or instruction which causes the event to increment.\nThis event does not count operations that are counted by FP_FIXED_OPS_SPEC or FP_SCALE2_OPS_SPEC."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json
new file mode 100644
index 000000000000..bd9c248387aa
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/general.json
@@ -0,0 +1,15 @@
+[
+    {
+        "ArchStdEvent": "CPU_CYCLES",
+        "PublicDescription": "This event counts CPU clock cycles when the PE is not in WFE/WFI. The clock measured by this event is defined as the physical clock driving the CPU logic."
+    },
+    {
+        "ArchStdEvent": "CNT_CYCLES",
+        "PublicDescription": "This event increments at a constant frequency equal to the rate of increment of the System Counter, CNTPCT_EL0.\nThis event does not increment when the PE is in WFE/WFI."
+    },
+    {
+        "EventCode": "0x01e1",
+        "EventName": "CPU_SLOT",
+        "PublicDescription": "Entitled CPU slots.\nThis event counts the number of slots. When in ST mode, this event shall increment by PMMIR_EL1.SLOTS quantities, and when in SMT partitioned resource mode (regardless of in WFI state or otherwise), this event is incremented by PMMIR_EL1.SLOTS/2 quantities."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json
new file mode 100644
index 000000000000..ed6f764eff24
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1d_cache.json
@@ -0,0 +1,122 @@
+[
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL",
+        "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed load or store operations, preload instructions, or hardware cache prefetching that missed in the L1 D-cache. This event only counts one event per cache line.\nSince the caches are Write-back only for this processor, there are no Write-through cache accesses."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE",
+        "PublicDescription": "This event counts L1 D-cache accesses from any load/store operations, software preload, or hardware prefetch operations. Atomic operations that resolve in the CPU's caches (near atomic operations) count as both a write access and read access. Each access to a cache line is counted including the multiple accesses caused by single instructions such as LDM or STM. Each access to other L1 data or unified memory structures, for example refill buffers, write buffers, and write-back buffers, is also counted.\nThis event counts the sum of the following events:\nL1D_CACHE_RD,\nL1D_CACHE_WR,\nL1D_CACHE_PRFM, and\nL1D_CACHE_HWPRF."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB",
+        "PublicDescription": "This event counts write-backs of dirty data from the L1 D-cache to the L2 cache. This occurs when either a dirty cache line is evicted from L1 D-cache and allocated in the L2 cache or dirty data is written to the L2 and possibly to the next level of cache. This event counts both victim cache line evictions and cache write-backs from snoops or cache maintenance operations. The following cache operations are not counted:\n* Invalidations which do not result in data being transferred out of the L1 (such as evictions of clean data),\n* Full line writes which write to L2 without writing L1, such as write streaming mode.\nThis event is the sum of the following events:\nL1D_CACHE_WB_CLEAN and\nL1D_CACHE_WB_VICTIM."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_LMISS_RD",
+        "PublicDescription": "This event counts cache line refills into the L1 D-cache from any memory Read operations, that incurred additional latency.\nCounts same as L1D_CACHE_REFILL_RD on this CPU."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_RD",
+        "PublicDescription": "This event counts L1 D-cache accesses from any Load operation. Atomic Load operations that resolve in the CPU's caches count as both a write access and read access."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WR",
+        "PublicDescription": "This event counts L1 D-cache accesses generated by Store operations. This event also counts accesses caused by a DC ZVA (D-cache zero, specified by virtual address) instruction. Near atomic operations that resolve in the CPU's caches count as a write access and read access.\nThis event is a subset of the L1D_CACHE event, except this event only counts memory Write operations."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_RD",
+        "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed Load instructions where the memory Read operation misses in the L1 D-cache. This event only counts one event per cache line.\nThis event is a subset of the L1D_CACHE_REFILL event, but only counts memory Read operations. This event does not count reads caused by cache maintenance operations or preload instructions."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_WR",
+        "PublicDescription": "This event counts L1 D-cache refills caused by speculatively executed Store instructions where the memory Write operation misses in the L1 D-cache. This event only counts one event per cache line.\nThis event is a subset of the L1D_CACHE_REFILL event, but only counts memory Write operations."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_INNER",
+        "PublicDescription": "This event counts L1 D-cache refills (L1D_CACHE_REFILL) where the cache line data came from caches inside the immediate Cluster of the Core (L2 cache)."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_OUTER",
+        "PublicDescription": "This event counts L1 D-cache refills (L1D_CACHE_REFILL) for which the cache line data came from outside the immediate Cluster of the Core, like an SLC in the system interconnect or DRAM or remote socket."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_VICTIM",
+        "PublicDescription": "This event counts dirty cache line evictions from the L1 D-cache caused by a new cache line allocation. This event does not count evictions caused by cache maintenance operations.\nThis event is a subset of the L1D_CACHE_WB event, but only counts write-backs that are a result of the line being allocated for an access made by the CPU."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_WB_CLEAN",
+        "PublicDescription": "This event counts write-backs from the L1 D-cache that are a result of a coherency operation made by another CPU. Event counts include cache maintenance operations.\nThis event is a subset of the L1D_CACHE_WB event."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_INVAL",
+        "PublicDescription": "This event counts each explicit invalidation of a cache line in the L1 D-cache caused by:\n* Cache Maintenance Operations (CMO) that operate by a virtual address.\n* Broadcast cache coherency operations from another CPU in the system.\nThis event does not count for the following conditions:\n* A cache refill invalidates a cache line.\n* A CMO which is executed on that CPU and invalidates a cache line specified by Set/Way.\nNote that CMOs that operate by Set/Way cannot be broadcast from one CPU to another."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_RW",
+        "PublicDescription": "This event counts L1 data demand cache accesses from any Load or Store operation. Near atomic operations that resolve in the CPU's caches count as both a write access and read access.\nThis event is implemented as L1D_CACHE_RD + L1D_CACHE_WR."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_PRFM",
+        "PublicDescription": "This event counts L1 D-cache accesses from software preload or prefetch instructions."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_MISS",
+        "PublicDescription": "This event counts each demand access counted by L1D_CACHE_RW that misses in the L1 Data or unified cache, causing an access to outside of the L1 caches of this PE."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_PRFM",
+        "PublicDescription": "This event counts L1 D-cache refills where the cache line access was generated by software preload or prefetch instructions."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_HWPRF",
+        "PublicDescription": "This event counts L1 D-cache accesses from any Load/Store operations generated by the hardware prefetcher."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_REFILL_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch access counted by L1D_CACHE_HWPRF that causes a refill of the L1 D-cache from outside of the L1 D-cache."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRFM",
+        "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched in response to a prefetch instruction. That is, the L1D_CACHE_REFILL_PRFM event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_HIT_RW_FHWPRF",
+        "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW_FPRF where the cache line was fetched by a hardware prefetcher. That is, the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1D_CACHE_HIT_RW_FPRF",
+        "PublicDescription": "This event counts each demand access first hit counted by L1D_CACHE_HIT_RW where the cache line was fetched in response to a prefetch instruction or by a hardware prefetcher. That is, the L1D_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1D_LFB_HIT_RW_FPRFM",
+        "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF where the cache line was fetched in response to a prefetch instruction. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_PRFM event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1D_LFB_HIT_RW_FHWPRF",
+        "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW_FPRF, where the cache line was fetched by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_HWPRF Event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1D_LFB_HIT_RW_FPRF",
+        "PublicDescription": "This event counts each demand access line-fill buffer first hit counted by L1D_LFB_HIT_RW where the cache line was fetched in response to a prefetch instruction or by a hardware prefetcher. That is, the access hits a cache line that is in the process of being loaded into the L1 D-cache, and so does not generate a new refill, but has to wait for the previous refill to complete, and the L1D_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x01f5",
+        "EventName": "L1D_CACHE_REFILL_RW",
+        "PublicDescription": "L1 D-cache refill, demand Read and Write. This event counts demand Read and Write accesses that cause a refill of the L1 D-cache of this PE, from outside of this cache."
+    },
+    {
+        "EventCode": "0x0204",
+        "EventName": "L1D_CACHE_REFILL_OUTER_LLC",
+        "PublicDescription": "This event counts L1D_CACHE_REFILL from L3 D-cache."
+    },
+    {
+        "EventCode": "0x0205",
+        "EventName": "L1D_CACHE_REFILL_OUTER_DRAM",
+        "PublicDescription": "This event counts L1D_CACHE_REFILL from local memory."
+    },
+    {
+        "EventCode": "0x0206",
+        "EventName": "L1D_CACHE_REFILL_OUTER_REMOTE",
+        "PublicDescription": "This event counts L1D_CACHE_REFILL from a remote memory."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json
new file mode 100644
index 000000000000..952454004d98
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l1i_cache.json
@@ -0,0 +1,114 @@
+[
+    {
+        "ArchStdEvent": "L1I_CACHE_REFILL",
+        "PublicDescription": "This event counts cache line refills in the L1 I-cache caused by a missed instruction fetch (demand, hardware prefetch, and software preload accesses). Instruction fetches may include accessing multiple instructions, but the single cache line allocation is counted once."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE",
+        "PublicDescription": "This event counts instruction fetches (demand, hardware prefetch, and software preload accesses) which access the L1 Instruction Cache. Instruction Cache accesses caused by cache maintenance operations are not counted."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_LMISS",
+        "PublicDescription": "This event counts cache line refills into the L1 I-cache, that incurred additional latency.\nCounts the same as L1I_CACHE_REFILL in this CPU."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_RD",
+        "PublicDescription": "This event counts demand instruction fetches which access the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_PRFM",
+        "PublicDescription": "This event counts instruction fetches generated by software preload or prefetch instructions which access the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_HWPRF",
+        "PublicDescription": "This event counts instruction fetches which access the L1 I-cache generated by the hardware prefetcher."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_REFILL_PRFM",
+        "PublicDescription": "This event counts cache line refills in the L1 I-cache caused by a missed instruction fetch generated by software preload or prefetch instructions. Instruction fetches may include accessing multiple instructions, but the single cache line allocation is counted once."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_REFILL_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch access counted by L1I_CACHE_HWPRF that causes a refill of the L1 I-cache from outside of the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_HIT_RD",
+        "PublicDescription": "This event counts demand instruction fetches that access the L1 I-cache and hit in the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_HIT_RD_FPRF",
+        "PublicDescription": "This event counts each demand fetch first hit counted by L1I_CACHE_HIT_RD where the cache line was fetched in response to a software preload or by a hardware prefetcher. That is, the L1I_CACHE_REFILL_PRF event was generated when the cache line was fetched into the cache.\nOnly the first hit by a demand access is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_HIT",
+        "PublicDescription": "This event counts instruction fetches that access the L1 I-cache (demand, hardware prefetch, and software preload accesses) and hit in the L1 I-cache. I-cache accesses caused by cache maintenance operations are not counted."
+    },
+    {
+        "ArchStdEvent": "L1I_CACHE_HIT_PRFM",
+        "PublicDescription": "This event counts instruction fetches generated by software preload or prefetch instructions that access the L1 I-cache and hit in the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "L1I_LFB_HIT_RD",
+        "PublicDescription": "This event counts demand instruction fetches that access the L1 I-cache and hit in a line that is in the process of being loaded into the L1 I-cache."
+    },
+    {
+        "EventCode": "0x0174",
+        "EventName": "L1I_HWPRF_REQ_DROP",
+        "PublicDescription": "L1 I-cache hardware prefetch dropped."
+    },
+    {
+        "EventCode": "0x01e3",
+        "EventName": "L1I_CACHE_REFILL_RD",
+        "PublicDescription": "L1 I-cache refill, Read.\nThis event counts each demand instruction fetch that causes a refill of the L1 I-cache of this PE, from outside of this cache."
+    },
+    {
+        "EventCode": "0x01ea",
+        "EventName": "L1I_CFC_ENTRIES",
+        "PublicDescription": "This event counts the CFC (Cache Fill Control) entries.\nThe CFC is the fill buffer for I-cache."
+    },
+    {
+        "EventCode": "0x01ef",
+        "EventName": "L1I_CACHE_INVAL",
+        "PublicDescription": "L1 I-cache invalidate.\nThis event counts each explicit invalidation of a cache line in the L1 I-cache caused by:\n* Broadcast cache coherency operations from another CPU in the system.\n* Invalidations due to capacity eviction in L2 D-cache.\nThis event does not count for the following conditions:\n* A cache refill invalidates a cache line.\n* A CMO which is executed on that CPU Core and invalidates a cache line specified by Set/Way.\n* Cache Maintenance Operations (CMO) that operate by a virtual address.\nNote that\n* CMOs that operate by Set/Way cannot be broadcast from one CPU Core to another.\n* The CMO is treated as No-op for the purposes of L1 I-cache line invalidation, as this Core implements fully coherent I-cache."
+    },
+    {
+        "EventCode": "0x0212",
+        "EventName": "L1I_CACHE_HIT_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch access that hits an L1 I-cache."
+    },
+    {
+        "EventCode": "0x0215",
+        "EventName": "L1I_LFB_HIT",
+        "PublicDescription": "L1 Line fill buffer hit.\nThis event counts each Demand or software preload or hardware prefetch induced instruction fetch that hits an L1 I-cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete."
+    },
+    {
+        "EventCode": "0x0216",
+        "EventName": "L1I_LFB_HIT_PRFM",
+        "PublicDescription": "This event counts each software prefetch access that hits a cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete."
+    },
+    {
+        "EventCode": "0x0219",
+        "EventName": "L1I_LFB_HIT_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch access that hits a cache line that is in the process of being loaded into the L1 instruction cache, and so does not generate a new refill, but has to wait for the previous refill to complete."
+    },
+    {
+        "EventCode": "0x0221",
+        "EventName": "L1I_PRFM_REQ",
+        "PublicDescription": "L1 I-cache software prefetch requests."
+    },
+    {
+        "EventCode": "0x0222",
+        "EventName": "L1I_HWPRF_REQ",
+        "PublicDescription": "L1 I-cache hardware prefetch requests."
+    },
+    {
+        "EventCode": "0x0228",
+        "EventName": "L1I_CACHE_HIT_PRFM_FPRF",
+        "PublicDescription": "L1 I-cache software prefetch access first hit, fetched by hardware or software prefetch.\nThis event counts each software preload access first hit where the cache line was fetched in response to a hardware prefetcher or software preload instruction.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x022a",
+        "EventName": "L1I_CACHE_HIT_HWPRF_FPRF",
+        "PublicDescription": "L1 I-cache hardware prefetch access first hit, fetched by hardware or software prefetch.\nThis event counts each hardware prefetch access first hit where the cache line was fetched in response to a hardware prefetcher or software preload instruction.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json
new file mode 100644
index 000000000000..66f21a94381e
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/l2d_cache.json
@@ -0,0 +1,134 @@
+[
+    {
+        "ArchStdEvent": "L2D_CACHE",
+        "PublicDescription": "This event counts accesses to the L2 cache due to data accesses. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses. This event also counts write-back of dirty data from L1 D-cache to the L2 cache.\nI-cache accesses are included in this event. This event is the sum of the following events:\nL2D_CACHE_RD,\nL2D_CACHE_WR,\nL2D_CACHE_PRFM, and\nL2D_CACHE_HWPRF."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL",
+        "PublicDescription": "This event counts cache line refills into the L2 cache. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache refills are included in this event. This event is the sum of the following events:\nL2D_CACHE_REFILL_RD,\nL2D_CACHE_REFILL_WR,\nL2D_CACHE_REFILL_HWPRF, and\nL2D_CACHE_REFILL_PRFM."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB",
+        "PublicDescription": "This event counts write-backs of data from the L2 cache to outside the CPU. This includes snoops to the L2 (from other CPUs) which return data even if the snoops cause an invalidation. L2 cache line invalidations which do not write data outside the CPU and snoops which return data from an L1 cache are not counted. Data would not be written outside the cache when invalidating a clean cache line.\nThis event is the sum of the following events:\nL2D_CACHE_WB_VICTIM and\nL2D_CACHE_WB_CLEAN."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_RD",
+        "PublicDescription": "This event counts L2 D-cache accesses due to memory Read operations. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache accesses are included in this event. This event is a subset of the L2D_CACHE event, but this event only counts memory Read operations."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WR",
+        "PublicDescription": "This event counts L2 cache accesses due to memory Write operations. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis event is a subset of the L2D_CACHE event, but this event only counts memory Write operations."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_RD",
+        "PublicDescription": "This event counts refills for memory accesses due to memory Read operation counted by L2D_CACHE_RD. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis CPU includes I-cache refills in this counter as an L2I equivalent event was not implemented. This event is a subset of the L2D_CACHE_REFILL event. This event does not count L2 refills caused by stashes into L2.\nThis count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_WR",
+        "PublicDescription": "This event counts refills for memory accesses due to memory Write operation counted by L2D_CACHE_WR. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis count includes demand requests that encounter an L2 prefetch request or an L2 software prefetch request to the same cache line, which is still pending in the L2 LFB."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_VICTIM",
+        "PublicDescription": "This event counts evictions from the L2 cache because of a line being allocated into the L2 cache.\nThis event is a subset of the L2D_CACHE_WB event."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_WB_CLEAN",
+        "PublicDescription": "This event counts write-backs from the L2 cache that are a result of any of the following:\n* Cache maintenance operations,\n* Snoop responses, or\n* Direct cache transfers to another CPU due to a forwarding snoop request.\nThis event is a subset of the L2D_CACHE_WB event."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_INVAL",
+        "PublicDescription": "This event counts each explicit invalidation of a cache line in the L2 cache by cache maintenance operations that operate by a virtual address, or by external coherency operations. This event does not count if either:\n* A cache refill invalidates a cache line, or\n* A cache Maintenance Operation (CMO), which invalidates a cache line specified by Set/Way,\nis executed on that CPU.\nCMOs that operate by Set/Way cannot be broadcast from one CPU to another."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_LMISS_RD",
+        "PublicDescription": "This event counts cache line refills into the L2 unified cache from any memory Read operations that incurred additional latency.\nCounts the same as L2D_CACHE_REFILL_RD in this CPU."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_RW",
+        "PublicDescription": "This event counts L2 cache demand accesses from any Load/Store operations. L2 cache is a unified cache for data and instruction accesses, accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nI-cache accesses are included in this event.\nThis event is the sum of the following events:\nL2D_CACHE_RD and\nL2D_CACHE_WR."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_PRFM",
+        "PublicDescription": "This event counts L2 D-cache accesses generated by software preload or prefetch instructions with target = L1/L2/L3 cache.\nNote that a software preload or prefetch instruction with (target = L1/L2/L3) that hits in L1D will not result in an L2 D-cache access. Therefore, such a software preload or prefetch instruction will not be counted by this event."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_MISS",
+        "PublicDescription": "This event counts cache line misses in the L2 cache. L2 cache is a unified cache for data and instruction accesses. Accesses are for misses in the L1 D-cache or translation resolutions due to accesses.\nThis event counts the same as L2D_CACHE_REFILL_RD in this CPU."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_PRFM",
+        "PublicDescription": "This event counts refills due to accesses generated as a result of software preload or prefetch instructions as counted by L2D_CACHE_PRFM. I-cache refills are included in this event."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_HWPRF",
+        "PublicDescription": "This event counts the L2 D-cache access caused by L1 or L2 hardware prefetcher."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch access counted by L2D_CACHE_HWPRF that causes a refill of the L2 cache, or any L1 Data, or Instruction cache of this PE, from outside of those caches.\nThis does not include prefetch requests pending waiting for a refill in LFB and a new demand request to the same cache line hitting the LFB entry. All such refills are counted as L2D_LFB_HIT_RWL1PRF_FHWPRF."
+    },
+    {
+        "ArchStdEvent": "L2D_CACHE_REFILL_PRF",
+        "PublicDescription": "This event counts each access to L2 Cache due to a prefetch instruction, or hardware prefetch that causes a refill of the L2 or any Level 1, from outside of those caches."
+    },
+    {
+        "EventCode": "0x0108",
+        "EventName": "L2D_CACHE_IF_REFILL",
+        "PublicDescription": "L2 D-cache refill, instruction fetch.\nThis event counts demand instruction fetch that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x0109",
+        "EventName": "L2D_CACHE_TBW_REFILL",
+        "PublicDescription": "L2 D-cache refill, Page table walk.\nThis event counts demand translation table walk that causes a refill of the L2 cache or L1 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x010a",
+        "EventName": "L2D_CACHE_PF_REFILL",
+        "PublicDescription": "L2 D-cache refill, prefetch.\nThis event counts L1 or L2 hardware or software prefetch accesses that cause a refill of the L2 cache or L1 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x010b",
+        "EventName": "L2D_LFB_HIT_RWL1PRF_FHWPRF",
+        "PublicDescription": "L2 line fill buffer demand Read, demand Write or L1 prefetch first hit, fetched by hardware prefetch.\nThis event counts each of the following accesses that hit the line-fill buffer when the same cache line is already being fetched due to an L2 hardware prefetcher.\n* Demand Read or Write\n* L1I-HWPRF\n* L1D-HWPRF\n* L1I PRFM\n* L1D PRFM\nThese accesses hit a cache line that is currently being loaded into the L2 cache as a result of a hardware prefetcher to the same line. Consequently, this access does not initiate a new refill but waits for the completion of the previous refill.\nOnly the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x0179",
+        "EventName": "L2D_CACHE_HIT_RWL1PRF_FHWPRF",
+        "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by hardware prefetch. This event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x01b8",
+        "EventName": "L2D_CACHE_L1PRF",
+        "PublicDescription": "L2 D-cache access, L1 hardware or software prefetch. This event counts L1 Hardware or software prefetch access to L2 D-cache."
+    },
+    {
+        "EventCode": "0x01b9",
+        "EventName": "L2D_CACHE_REFILL_L1PRF",
+        "PublicDescription": "L2 D-cache refill, L1 hardware or software prefetch.\nThis event counts each access counted by L2D_CACHE_L1PRF that causes a refill of the L2 cache or any L1 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x0201",
+        "EventName": "L2D_CACHE_BACKSNOOP_L1D_VIRT_ALIASING",
+        "PublicDescription": "This event counts when the L2 D-cache sends an invalidating back-snoop to the L1 D for an access initiated by the L1 D, where the corresponding line is already present in the L1 D-cache.\nThe L2 D-cache line tags the PE that refilled the line. It also retains specific bits of the VA to identify virtually aliased addresses.\nThe L1 D request requiring a back-snoop can originate either from the same PE that refilled the L2 D line or from a different PE. In either case, this event only counts those back-snoops where the requested VA mismatches the VA stored in the L2 D tag.\nThis event is counted only by the PE that initiated the original request necessitating a back-snoop.\nNote: The L1 D is VIPT, it identifies this access as a miss. Conversely, as L2 is PIPT, it identifies this as a hit. L2 D utilizes the back-snoop mechanism to refill L1 D with the snooped data."
+    },
+    {
+        "EventCode": "0x0208",
+        "EventName": "L2D_CACHE_RWL1PRF",
+        "PublicDescription": "L2 D-cache access, demand Read, demand Write or L1 hardware or software prefetch.\nThis event counts each access to L2 D-cache due to the following:\n* Demand Read or Write.\n* L1 Hardware or software prefetch."
+    },
+    {
+        "EventCode": "0x020a",
+        "EventName": "L2D_CACHE_REFILL_RWL1PRF",
+        "PublicDescription": "L2 D-cache refill, demand Read, demand Write or L1 hardware or software prefetch.\nThis event counts each access counted by L2D_CACHE_RWL1PRF that causes a refill of the L2 cache, or any L1 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x020c",
+        "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRFM",
+        "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software prefetch.\nThis event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x020e",
+        "EventName": "L2D_CACHE_HIT_RWL1PRF_FPRF",
+        "PublicDescription": "L2 D-cache demand Read, demand Write and L1 prefetch hit, fetched by software or hardware prefetch.\nThis event counts each demand Read, demand Write and L1 hardware or software prefetch request that hit an L2 D-cache line that was refilled into L2 D-cache in response to an L2 hardware prefetch or software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json
new file mode 100644
index 000000000000..851d0a70de9c
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/ll_cache.json
@@ -0,0 +1,107 @@
+[
+    {
+        "ArchStdEvent": "L3D_CACHE_ALLOCATE",
+        "PublicDescription": "This event counts each memory Write operation that writes an entire line into the L3 data without fetching data from outside the L3 Data. These are allocations of cache lines in the L3 Data that are not refills counted by\nL3D_CACHE_REFILL. For example:\n* A Write-back of an entire cache line from an L2 cache to the L3 D-cache.\n* A Write of an entire cache line from a coalescing Write buffer.\n* An operation such as DC ZVA.\nThis counter does not count writes that write an entire line to beyond level 3. Thus this counter does not count the streaming writes to beyond L3 cache."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE that causes a refill of the L3 Data, or any L1 Data, instruction or L2 cache of this PE, from outside of those caches. This includes the refill due to hardware prefetch and software prefetch accesses.\nThis event is a sum of L3D_CACHE_MISS, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL_HWPRF event.\nA refill includes any access that causes data to be fetched from outside of the L1 to L3 caches, even if the data is ultimately not allocated into the L3 D-cache."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE",
+        "PublicDescription": "This event counts each memory Read operation or memory Write operation that causes a cache access to the Level 3.\nThis event is a sum of the following Events:\n* L3D_CACHE_RD(0x00a0)\n* L3D_CACHE_ALLOCATE(0x0029)\n* L3D_CACHE_PRFM(0x8151)\n* L3D_CACHE_HWPRF(0x8156)\n* L2D_CACHE_WB(0x0018)"
+    },
+    {
+        "ArchStdEvent": "LL_CACHE_RD",
+        "PublicDescription": "This is an alias to the event L3D_CACHE_RD (0x00a0)."
+    },
+    {
+        "ArchStdEvent": "LL_CACHE_MISS_RD",
+        "PublicDescription": "This is an alias to the event L3D_CACHE_REFILL_RD (0x00a2)."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_RD",
+        "PublicDescription": "This event counts each Memory Read operation to L3 D-cache from instruction fetch, Load/Store, and MMU translation table accesses. This does not include hardware prefetcher or PRFM instruction accesses. This includes L1 and L2 prefetcher accesses to L3 D-cache."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL_RD",
+        "PublicDescription": "This event counts each access counted by both L3D_CACHE_RD and L3D_CACHE_REFILL. That is, every refill of the L3 cache counted by L3D_CACHE_REFILL that is caused by a Memory Read operation.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_LMISS_RD",
+        "PublicDescription": "This event counts each memory Read operation to the L3 cache counted by L3D_CACHE that incurs additional latency because it returns data from outside of the L1 to L3 caches.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_RW",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE that is due to a demand memory Read operation or demand memory Write operation.\nThis event is a sum of L3D_CACHE_RD(0x00a0), L3D_CACHE_ALLOCATE(0x0029) and L2D_CACHE_WB(0x0018).\nNote that this counter does not count writes that write an entire line to beyond level 3. Thus this counter does not count the streaming Writes to beyond L3 cache."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_PRFM",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE that is due to a prefetch instruction. This includes L3 Data accesses due to the L1, L2, or L3 prefetch instruction."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_MISS",
+        "PublicDescription": "This event counts each demand Read access counted by L3D_CACHE_RD that misses in the L1 to L3 Data, causing an access to outside of the L3 cache.\nThe L3D_CACHE_MISS(0x8152), L3D_CACHE_REFILL_RD (0x00a2) and L3D_CACHE_LMISS_RD(0x400b) count the same event in the hardware."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL_PRFM",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE_PRFM that causes a refill of the L3 cache, or any L1 or L2 Data, from outside of those caches."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_HWPRF",
+        "PublicDescription": "This event counts each access to L3 cache that is due to a hardware prefetcher. This includes L3D accesses due to the Level-1 or Level-2 or Level-3 hardware prefetcher."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL_HWPRF",
+        "PublicDescription": "This event counts each hardware prefetch counted by L3D_CACHE_HWPRF that causes a refill of the L3 Data or unified cache, or any L1 or L2 Data, Instruction, or unified cache of this PE, from outside of those caches."
+    },
+    {
+        "ArchStdEvent": "L3D_CACHE_REFILL_PRF",
+        "PublicDescription": "This event counts each access to L3 cache due to a prefetch instruction, or hardware prefetch that causes a refill of the L3 Data, or any L1 or L2 Data, from outside of those caches."
+    },
+    {
+        "EventCode": "0x01e8",
+        "EventName": "L3D_CACHE_RWL1PRFL2PRF",
+        "PublicDescription": "L3 cache access, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch.\nThis event counts each access to L3 D-cache due to the following:\n* Demand Read or Write.\n* L1 Hardware or software prefetch.\n* L2 Hardware or software prefetch."
+    },
+    {
+        "EventCode": "0x01e9",
+        "EventName": "L3D_CACHE_REFILL_RWL1PRFL2PRF",
+        "PublicDescription": "L3 cache refill, demand Read, demand Write, L1 hardware or software prefetch or L2 hardware or software prefetch.\nThis event counts each access counted by L3D_CACHE_RWL1PRFL2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x01f6",
+        "EventName": "L3D_CACHE_REFILL_L2PRF",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE_L2PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x01f7",
+        "EventName": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF",
+        "PublicDescription": "L3 cache demand Read, demand Write, L1 prefetch or L2 prefetch first hit, fetched by software or hardware prefetch.\nThis event counts each demand Read, demand Write, L1 hardware or software prefetch request and L2 hardware or software prefetch that hit an L3 D-cache line that was refilled into L3 D-cache in response to an L3 hardware prefetch or software prefetch. Only the first hit is counted. After this event is generated for a cache line, the event is not generated again for the same cache line while it remains in the cache."
+    },
+    {
+        "EventCode": "0x0225",
+        "EventName": "L3D_CACHE_REFILL_IF",
+        "PublicDescription": "L3 cache refill, instruction fetch.\nThis event counts demand instruction fetch that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x0226",
+        "EventName": "L3D_CACHE_REFILL_MM",
+        "PublicDescription": "L3 cache refill, translation table walk access.\nThis event counts demand translation table access that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x0227",
+        "EventName": "L3D_CACHE_REFILL_L1PRF",
+        "PublicDescription": "This event counts each access counted by L3D_CACHE_L1PRF that causes a refill of the L3 cache, or any L1 or L2 cache of this PE, from outside of those caches."
+    },
+    {
+        "EventCode": "0x022c",
+        "EventName": "L3D_CACHE_L1PRF",
+        "PublicDescription": "This event counts the L3 D-cache access due to L1 hardware prefetch or software prefetch request.\nThe L1 hardware prefetch or software prefetch requests that miss the L1I, L1D and L2 D-cache are counted by this counter."
+    },
+    {
+        "EventCode": "0x022d",
+        "EventName": "L3D_CACHE_L2PRF",
+        "PublicDescription": "This event counts the L3 D-cache access due to L2 hardware prefetch or software prefetch request.\nThe L2 hardware prefetch or software prefetch requests that miss the L2 D-cache are counted by this counter."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json
new file mode 100644
index 000000000000..becd2d90bf39
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/memory.json
@@ -0,0 +1,46 @@
+[
+    {
+        "ArchStdEvent": "MEM_ACCESS",
+        "PublicDescription": "This event counts memory accesses issued by the CPU load/store unit, where those accesses are issued due to load or store operations. This event counts memory accesses regardless of whether the data is received from any level of cache hierarchy or external memory. If memory accesses are broken up into smaller transactions than what were specified in the load or store instructions, then the event counts those smaller memory transactions.\nMemory accesses generated by the following instructions or activity are not counted: instruction fetches, cache maintenance instructions, translation table walks or prefetches, memory prefetch operations. This event counts the sum of the following events:\nMEM_ACCESS_RD and\nMEM_ACCESS_WR."
+    },
+    {
+        "ArchStdEvent": "MEMORY_ERROR",
+        "PublicDescription": "This event counts any detected correctable or uncorrectable physical memory errors (ECC or parity) in protected CPU RAMs. On the Core, this event counts errors in the caches (including data and tag RAMs). Any detected memory error (from either a speculative and abandoned access, or an architecturally executed access) is counted.\nNote that errors are only detected when the actual protected memory is accessed by an operation."
+    },
+    {
+        "ArchStdEvent": "REMOTE_ACCESS",
+        "PublicDescription": "This event counts each external bus read access that causes an access to a remote device. That is, a socket that does not contain the PE."
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_RD",
+        "PublicDescription": "This event counts memory accesses issued by the CPU due to Load operations. This event counts any memory Load access, no matter whether the data is received from any level of cache hierarchy or external memory. This event also counts atomic Load operations. If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the event counts those smaller transactions.\nThe following instructions are not counted:\n1) Instruction fetches,\n2) Cache maintenance instructions,\n3) Translation table walks or prefetches,\n4) Memory prefetch operations.\nThis event is a subset of the MEM_ACCESS event but the event only counts memory-Read operations."
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_WR",
+        "PublicDescription": "This event counts memory accesses issued by the CPU due to Store operations. This event counts any memory Store access, no matter whether the data is located in any level of cache or external memory. This event also counts atomic Load and Store operations. If memory accesses are broken up by the Load/Store unit into smaller transactions that are issued by the bus interface, then the event counts those smaller transactions."
+    },
+    {
+        "ArchStdEvent": "LDST_ALIGN_LAT",
+        "PublicDescription": "This event counts the number of memory Read and Write accesses in a cycle that incurred additional latency due to the alignment of the address and the size of data being accessed, which results in a store crossing a single cache line.\nThis event is implemented as the sum of the following events on this CPU:\nLD_ALIGN_LAT and\nST_ALIGN_LAT."
+    },
+    {
+        "ArchStdEvent": "LD_ALIGN_LAT",
+        "PublicDescription": "This event counts the number of memory Read accesses in a cycle that incurred additional latency due to the alignment of the address and size of data being accessed, which results in a load crossing a single cache line."
+    },
+    {
+        "ArchStdEvent": "ST_ALIGN_LAT",
+        "PublicDescription": "This event counts the number of memory Write accesses in a cycle that incurred additional latency due to the alignment of the address and size of data being accessed."
+    },
+    {
+        "ArchStdEvent": "INST_FETCH_PERCYC",
+        "PublicDescription": "This event counts the number of instruction fetches outstanding per cycle, which will provide an average latency of instruction fetch."
+    },
+    {
+        "ArchStdEvent": "MEM_ACCESS_RD_PERCYC",
+        "PublicDescription": "This event counts the number of outstanding Loads or memory Read accesses per cycle."
+    },
+    {
+        "ArchStdEvent": "INST_FETCH",
+        "PublicDescription": "This event counts instruction memory accesses that the PE makes."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json
new file mode 100644
index 000000000000..b825ede03f54
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/metrics.json
@@ -0,0 +1,722 @@
+[
+    {
+        "MetricName": "backend_bound",
+        "MetricExpr": "100 * (STALL_SLOT_BACKEND / CPU_SLOT)",
+        "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the backend of the processor.",
+        "ScaleUnit": "1percent of slots",
+        "MetricGroup": "TopdownL1"
+    },
+    {
+        "MetricName": "backend_busy_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_BUSY / STALL_BACKEND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to issue queues being too full to accept operations for execution.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_cache_l1d_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_L1D / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by L1 D-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_cache_l2d_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_MEM / (STALL_BACKEND_L1D + STALL_BACKEND_MEM))",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by L2 D-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_core_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_CPUBOUND / STALL_BACKEND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints not related to memory access latency issues caused by memory access components.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_core_rename_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_RENAME / STALL_BACKEND_CPUBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend as the rename unit registers are unavailable.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_mem_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_MEMBOUND / STALL_BACKEND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to backend Core resource constraints related to memory access latency issues caused by memory access components.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_mem_cache_bound",
+        "MetricExpr": "100 * ((STALL_BACKEND_L1D + STALL_BACKEND_MEM) / STALL_BACKEND_MEMBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory latency issues caused by D-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_mem_store_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_ST / STALL_BACKEND_MEMBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory Write pending caused by Stores stalled in the pre-commit stage.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_mem_tlb_bound",
+        "MetricExpr": "100 * (STALL_BACKEND_TLB / STALL_BACKEND_MEMBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the backend due to memory access latency issues caused by Data TLB misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Backend"
+    },
+    {
+        "MetricName": "backend_stalled_cycles",
+        "MetricExpr": "100 * (STALL_BACKEND / CPU_CYCLES)",
+        "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the backend unit of the processor.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Cycle_Accounting"
+    },
+    {
+        "MetricName": "bad_speculation",
+        "MetricExpr": "100 - (frontend_bound + retiring + backend_bound)",
+        "BriefDescription": "This metric is the percentage of total slots that executed operations and didn't retire due to a pipeline flush. This indicates cycles that were utilized but inefficiently.",
+        "ScaleUnit": "1percent of slots",
+        "MetricGroup": "TopdownL1"
+    },
+    {
+        "MetricName": "barrier_percentage",
+        "MetricExpr": "100 * ((ISB_SPEC + DSB_SPEC + DMB_SPEC) / INST_SPEC)",
+        "BriefDescription": "This metric measures instruction and data barrier operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "branch_direct_ratio",
+        "MetricExpr": "BR_IMMED_RETIRED / BR_RETIRED",
+        "BriefDescription": "This metric measures the ratio of direct branches retired to the total number of branches architecturally executed.",
+        "ScaleUnit": "1per branch",
+        "MetricGroup": "Branch_Effectiveness"
+    },
+    {
+        "MetricName": "branch_indirect_ratio",
+        "MetricExpr": "BR_IND_RETIRED / BR_RETIRED",
+        "BriefDescription": "This metric measures the ratio of indirect branches retired, including function returns, to the total number of branches architecturally executed.",
+        "ScaleUnit": "1per branch",
+        "MetricGroup": "Branch_Effectiveness"
+    },
+    {
+        "MetricName": "branch_misprediction_ratio",
+        "MetricExpr": "BR_MIS_PRED_RETIRED / BR_RETIRED",
+        "BriefDescription": "This metric measures the ratio of branches mispredicted to the total number of branches architecturally executed. This gives an indication of the effectiveness of the branch prediction unit.",
+        "ScaleUnit": "1per branch",
+        "MetricGroup": "Miss_Ratio;Branch_Effectiveness"
+    },
+    {
+        "MetricName": "branch_mpki",
+        "MetricExpr": "1000 * (BR_MIS_PRED_RETIRED / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of branch mispredictions per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;Branch_Effectiveness"
+    },
+    {
+        "MetricName": "branch_percentage",
+        "MetricExpr": "100 * ((BR_IMMED_SPEC + BR_INDIRECT_SPEC) / INST_SPEC)",
+        "BriefDescription": "This metric measures branch operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "branch_return_ratio",
+        "MetricExpr": "BR_RETURN_RETIRED / BR_RETIRED",
+        "BriefDescription": "This metric measures the ratio of branches retired that are function returns to the total number of branches architecturally executed.",
+        "ScaleUnit": "1per branch",
+        "MetricGroup": "Branch_Effectiveness"
+    },
+    {
+        "MetricName": "bus_bandwidth",
+        "MetricExpr": "BUS_ACCESS * 32 / duration_time",
+        "BriefDescription": "This metric measures the bus bandwidth of the data transferred between this PE's L2 and the unCore in the system.",
+        "ScaleUnit": "1Bytes/sec"
+    },
+    {
+        "MetricName": "cpu_cycles_fraction_in_st_mode",
+        "MetricExpr": "((CPU_SLOT/CPU_CYCLES) - 5) / 5",
+        "BriefDescription": "This metric counts the fraction of CPU cycles spent in ST mode during program execution.",
+        "ScaleUnit": "1fraction of cycles",
+        "MetricGroup": "SMT"
+    },
+    {
+        "MetricName": "cpu_cycles_in_smt_mode",
+        "MetricExpr": "(1 - cpu_cycles_fraction_in_st_mode) * CPU_CYCLES",
+        "BriefDescription": "This metric counts CPU cycles in SMT mode during program execution.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "SMT"
+    },
+    {
+        "MetricName": "cpu_cycles_in_st_mode",
+        "MetricExpr": "cpu_cycles_fraction_in_st_mode * CPU_CYCLES",
+        "BriefDescription": "This metric counts CPU cycles in ST mode during program execution.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "SMT"
+    },
+    {
+        "MetricName": "crypto_percentage",
+        "MetricExpr": "100 * (CRYPTO_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures crypto operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "dtlb_mpki",
+        "MetricExpr": "1000 * (DTLB_WALK / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of Data TLB Walks per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "dtlb_walk_average_latency",
+        "MetricExpr": "DTLB_WALK_PERCYC / DTLB_WALK",
+        "BriefDescription": "This metric measures the average latency of Data TLB walks in CPU cycles.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "Average_Latency"
+    },
+    {
+        "MetricName": "dtlb_walk_ratio",
+        "MetricExpr": "DTLB_WALK / L1D_TLB",
+        "BriefDescription": "This metric measures the ratio of Data TLB Walks to the total number of Data TLB accesses. This gives an indication of the effectiveness of the Data TLB accesses.",
+        "ScaleUnit": "1per TLB access",
+        "MetricGroup": "Miss_Ratio;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "fp16_percentage",
+        "MetricExpr": "100 * (FP_HP_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures half-precision floating point operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "FP_Precision_Mix"
+    },
+    {
+        "MetricName": "fp32_percentage",
+        "MetricExpr": "100 * (FP_SP_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures single-precision floating point operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "FP_Precision_Mix"
+    },
+    {
+        "MetricName": "fp64_percentage",
+        "MetricExpr": "100 * (FP_DP_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures double-precision floating point operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "FP_Precision_Mix"
+    },
+    {
+        "MetricName": "fp_ops_per_cycle",
+        "MetricExpr": "(FP_SCALE_OPS_SPEC + FP_FIXED_OPS_SPEC) / CPU_CYCLES",
+        "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by any instruction. Operations are counted by computation and by vector lanes; for example, fused computations such as multiply-add count as two operations per vector lane.",
+        "ScaleUnit": "1operations per cycle",
+        "MetricGroup": "FP_Arithmetic_Intensity"
+    },
+    {
+        "MetricName": "frontend_bound",
+        "MetricExpr": "100 * (STALL_SLOT_FRONTEND_WITHOUT_MISPRED / CPU_SLOT)",
+        "BriefDescription": "This metric is the percentage of total slots that were stalled due to resource constraints in the frontend of the processor.",
+        "ScaleUnit": "1percent of slots",
+        "MetricGroup": "TopdownL1"
+    },
+    {
+        "MetricName": "frontend_cache_l1i_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_L1I / (STALL_FRONTEND_L1I + STALL_FRONTEND_MEM))",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L1 I-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_cache_l2i_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_MEM / (STALL_FRONTEND_L1I + STALL_FRONTEND_MEM))",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to memory access latency issues caused by L2 I-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_core_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_CPUBOUND / STALL_FRONTEND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints not related to instruction fetch latency issues caused by memory access components.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_core_flow_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_FLOW / STALL_FRONTEND_CPUBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the decode unit is awaiting input from the branch prediction unit.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_core_flush_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_FLUSH / STALL_FRONTEND_CPUBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend as the processor is recovering from a pipeline flush caused by bad speculation or other machine resteers.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_mem_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_MEMBOUND / STALL_FRONTEND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to frontend Core resource constraints related to the instruction fetch latency issues caused by memory access components.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_mem_cache_bound",
+        "MetricExpr": "100 * ((STALL_FRONTEND_L1I + STALL_FRONTEND_MEM) / STALL_FRONTEND_MEMBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to instruction fetch latency issues caused by I-cache misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_mem_tlb_bound",
+        "MetricExpr": "100 * (STALL_FRONTEND_TLB / STALL_FRONTEND_MEMBOUND)",
+        "BriefDescription": "This metric is the percentage of total cycles stalled in the frontend due to instruction fetch latency issues caused by Instruction TLB misses.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Topdown_Frontend"
+    },
+    {
+        "MetricName": "frontend_stalled_cycles",
+        "MetricExpr": "100 * (STALL_FRONTEND / CPU_CYCLES)",
+        "BriefDescription": "This metric is the percentage of cycles that were stalled due to resource constraints in the frontend unit of the processor.",
+        "ScaleUnit": "1percent of cycles",
+        "MetricGroup": "Cycle_Accounting"
+    },
+    {
+        "MetricName": "instruction_fetch_average_latency",
+        "MetricExpr": "INST_FETCH_PERCYC / INST_FETCH",
+        "BriefDescription": "This metric measures the average latency of instruction fetches in CPU cycles.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "Average_Latency"
+    },
+    {
+        "MetricName": "integer_dp_percentage",
+        "MetricExpr": "100 * (DP_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures scalar integer operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "ipc",
+        "MetricExpr": "INST_RETIRED / CPU_CYCLES",
+        "BriefDescription": "This metric measures the number of instructions retired per cycle.",
+        "ScaleUnit": "1per cycle",
+        "MetricGroup": "General"
+    },
+    {
+        "MetricName": "itlb_mpki",
+        "MetricExpr": "1000 * (ITLB_WALK / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of instruction TLB Walks per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;ITLB_Effectiveness"
+    },
+    {
+        "MetricName": "itlb_walk_average_latency",
+        "MetricExpr": "ITLB_WALK_PERCYC / ITLB_WALK",
+        "BriefDescription": "This metric measures the average latency of instruction TLB walks in CPU cycles.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "Average_Latency"
+    },
+    {
+        "MetricName": "itlb_walk_ratio",
+        "MetricExpr": "ITLB_WALK / L1I_TLB",
+        "BriefDescription": "This metric measures the ratio of instruction TLB Walks to the total number of Instruction TLB accesses. This gives an indication of the effectiveness of the Instruction TLB accesses.",
+        "ScaleUnit": "1per TLB access",
+        "MetricGroup": "Miss_Ratio;ITLB_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_cache_miss_ratio",
+        "MetricExpr": "L1D_CACHE_REFILL / L1D_CACHE",
+        "BriefDescription": "This metric measures the ratio of L1 D-cache accesses missed to the total number of L1 D-cache accesses. This gives an indication of the effectiveness of the L1 D-cache.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "Miss_Ratio;L1D_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_cache_mpki",
+        "MetricExpr": "1000 * (L1D_CACHE_REFILL / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L1 D-cache accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;L1D_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_cache_rw_miss_ratio",
+        "MetricExpr": "l1d_demand_misses / l1d_demand_accesses",
+        "BriefDescription": "This metric measures the ratio of L1 D-cache Read accesses missed to the total number of L1 D-cache accesses. This gives an indication of the effectiveness of the L1 D-cache for demand Load or Store traffic.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_demand_accesses",
+        "MetricExpr": "L1D_CACHE_RW",
+        "BriefDescription": "This metric measures the count of L1 D-cache accesses incurred on Load or Store by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_demand_misses",
+        "MetricExpr": "L1D_CACHE_REFILL_RW",
+        "BriefDescription": "This metric measures the count of L1 D-cache misses incurred on a Load or Store by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_prf_accuracy",
+        "MetricExpr": "100 * (l1d_useful_prf / l1d_refilled_prf)",
+        "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.",
+        "ScaleUnit": "1percent of prefetch",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_prf_coverage",
+        "MetricExpr": "100 * (l1d_useful_prf / (l1d_demand_misses + l1d_refilled_prf))",
+        "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.",
+        "ScaleUnit": "1percent of cache access",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_refilled_prf",
+        "MetricExpr": "L1D_CACHE_REFILL_HWPRF + L1D_CACHE_REFILL_PRFM + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L1 data prefetcher (hardware prefetches or software preload) into L1 D-cache.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_tlb_miss_ratio",
+        "MetricExpr": "L1D_TLB_REFILL / L1D_TLB",
+        "BriefDescription": "This metric measures the ratio of L1 Data TLB accesses missed to the total number of L1 Data TLB accesses. This gives an indication of the effectiveness of the L1 Data TLB.",
+        "ScaleUnit": "1per TLB access",
+        "MetricGroup": "Miss_Ratio;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_tlb_mpki",
+        "MetricExpr": "1000 * (L1D_TLB_REFILL / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L1 Data TLB accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "l1d_useful_prf",
+        "MetricExpr": "L1D_CACHE_HIT_RW_FPRF + L1D_LFB_HIT_RW_FHWPRF + L1D_LFB_HIT_RW_FPRFM",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L1 data prefetcher (hardware prefetches or software preload) into L1 D-cache which are further used by Load or Store from the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1D_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_cache_miss_ratio",
+        "MetricExpr": "L1I_CACHE_REFILL / L1I_CACHE",
+        "BriefDescription": "This metric measures the ratio of L1 I-cache accesses missed to the total number of L1 I-cache accesses. This gives an indication of the effectiveness of the L1 I-cache.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "Miss_Ratio;L1I_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_cache_mpki",
+        "MetricExpr": "1000 * (L1I_CACHE_REFILL / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L1 I-cache accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;L1I_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_cache_rd_miss_ratio",
+        "MetricExpr": "l1i_demand_misses / l1i_demand_accesses",
+        "BriefDescription": "This metric measures the ratio of L1 I-cache Read accesses missed to the total number of L1 I-cache accesses. This gives an indication of the effectiveness of the L1 I-cache for demand instruction fetch traffic. Note that cache accesses in this cache are demand instruction fetch.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_demand_accesses",
+        "MetricExpr": "L1I_CACHE_RD",
+        "BriefDescription": "This metric measures the count of L1 I-cache accesses caused by an instruction fetch by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_demand_misses",
+        "MetricExpr": "L1I_CACHE_REFILL_RD",
+        "BriefDescription": "This metric measures the count of L1 I-cache misses caused by an instruction fetch by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_prf_accuracy",
+        "MetricExpr": "100 * (l1i_useful_prf / l1i_refilled_prf)",
+        "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.",
+        "ScaleUnit": "1percent of prefetch",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_prf_coverage",
+        "MetricExpr": "100 * (l1i_useful_prf / (l1i_demand_misses + l1i_refilled_prf))",
+        "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.",
+        "ScaleUnit": "1percent of cache access",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_refilled_prf",
+        "MetricExpr": "L1I_CACHE_REFILL_HWPRF + L1I_CACHE_REFILL_PRFM",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L1 instruction prefetcher (hardware prefetches or software preload) into L1 I-cache.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_tlb_miss_ratio",
+        "MetricExpr": "L1I_TLB_REFILL / L1I_TLB",
+        "BriefDescription": "This metric measures the ratio of L1 Instruction TLB accesses missed to the total number of L1 Instruction TLB accesses. This gives an indication of the effectiveness of the L1 Instruction TLB.",
+        "ScaleUnit": "1per TLB access",
+        "MetricGroup": "Miss_Ratio;ITLB_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_tlb_mpki",
+        "MetricExpr": "1000 * (L1I_TLB_REFILL / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L1 Instruction TLB accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;ITLB_Effectiveness"
+    },
+    {
+        "MetricName": "l1i_useful_prf",
+        "MetricExpr": "L1I_CACHE_HIT_RD_FPRF",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L1 instruction prefetcher (hardware prefetches or software preload) into L1 I-cache which are further used by instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L1I_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2_cache_miss_ratio",
+        "MetricExpr": "L2D_CACHE_REFILL / L2D_CACHE",
+        "BriefDescription": "This metric measures the ratio of L2 cache accesses missed to the total number of L2 cache accesses. This gives an indication of the effectiveness of the L2 cache, which is a unified cache that stores both data and instruction.\nNote that cache accesses in this cache are either data memory access or instruction fetch as this is a unified cache.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "Miss_Ratio;L2_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l2_cache_mpki",
+        "MetricExpr": "1000 * (l2d_demand_misses / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L2 unified cache accesses missed per thousand instructions executed.\nNote that cache accesses in this cache are either data memory access or instruction fetch as this is a unified cache.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;L2_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "l2_tlb_miss_ratio",
+        "MetricExpr": "L2D_TLB_REFILL / L2D_TLB",
+        "BriefDescription": "This metric measures the ratio of L2 unified TLB accesses missed to the total number of L2 unified TLB accesses.\nThis gives an indication of the effectiveness of the L2 TLB.",
+        "ScaleUnit": "1per TLB access",
+        "MetricGroup": "Miss_Ratio;ITLB_Effectiveness;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "l2_tlb_mpki",
+        "MetricExpr": "1000 * (L2D_TLB_REFILL / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of L2 unified TLB accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;ITLB_Effectiveness;DTLB_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_cache_rwl1prf_miss_ratio",
+        "MetricExpr": "l2d_demand_misses / l2d_demand_accesses",
+        "BriefDescription": "This metric measures the ratio of L2 D-cache Read accesses missed to the total number of L2 D-cache accesses.\nThis gives an indication of the effectiveness of the L2 D-cache for demand instruction fetch, Load, Store, or L1 prefetcher accesses traffic.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_demand_accesses",
+        "MetricExpr": "L2D_CACHE_RD + L2D_CACHE_WR + L2D_CACHE_L1PRF",
+        "BriefDescription": "This metric measures the count of L2 D-cache accesses incurred on an instruction fetch, Load, Store, or L1 prefetcher accesses by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_demand_misses",
+        "MetricExpr": "L2D_CACHE_REFILL_RD + L2D_CACHE_REFILL_WR + L2D_CACHE_REFILL_L1PRF",
+        "BriefDescription": "This metric measures the count of L2 D-cache misses incurred on an instruction fetch, Load, Store, or L1 prefetcher accesses by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_prf_accuracy",
+        "MetricExpr": "100 * (l2d_useful_prf / l2d_refilled_prf)",
+        "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.",
+        "ScaleUnit": "1percent of prefetch",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_prf_coverage",
+        "MetricExpr": "100 * (l2d_useful_prf / (l2d_demand_misses + l2d_refilled_prf))",
+        "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.",
+        "ScaleUnit": "1percent of cache access",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_refilled_prf",
+        "MetricExpr": "(L2D_CACHE_REFILL_PRF - L2D_CACHE_REFILL_L1PRF) + L2D_LFB_HIT_RWL1PRF_FHWPRF",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L2 data prefetcher (hardware prefetches or software preload) into L2 D-cache.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l2d_useful_prf",
+        "MetricExpr": "L2D_CACHE_HIT_RWL1PRF_FPRF + L2D_LFB_HIT_RWL1PRF_FHWPRF",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L2 data prefetcher (hardware prefetches or software preload) into L2 D-cache which are further used by instruction fetch, Load, Store, or L1 prefetcher accesses from the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L2_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_cache_rwl1prfl2prf_miss_ratio",
+        "MetricExpr": "l3d_demand_misses / l3d_demand_accesses",
+        "BriefDescription": "This metric measures the ratio of L3 D-cache Read accesses missed to the total number of L3 D-cache accesses. This gives an indication of the effectiveness of the L3 D-cache for demand instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses traffic.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_demand_accesses",
+        "MetricExpr": "L3D_CACHE_RWL1PRFL2PRF",
+        "BriefDescription": "This metric measures the count of L3 D-cache accesses incurred on an instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_demand_misses",
+        "MetricExpr": "L3D_CACHE_REFILL_RWL1PRFL2PRF",
+        "BriefDescription": "This metric measures the count of L3 D-cache misses incurred on an instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses by the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_prf_accuracy",
+        "MetricExpr": "100 * (l3d_useful_prf / l3d_refilled_prf)",
+        "BriefDescription": "This metric measures the fraction of prefetched memory addresses that are used by the instruction stream.",
+        "ScaleUnit": "1percent of prefetch",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_prf_coverage",
+        "MetricExpr": "100 * (l3d_useful_prf / (l3d_demand_misses + l3d_refilled_prf))",
+        "BriefDescription": "This metric measures the baseline demand cache misses which the prefetcher brings into the cache.",
+        "ScaleUnit": "1percent of cache access",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_refilled_prf",
+        "MetricExpr": "L3D_CACHE_REFILL_HWPRF + L3D_CACHE_REFILL_PRFM - L3D_CACHE_REFILL_L1PRF - L3D_CACHE_REFILL_L2PRF",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L3 data prefetcher (hardware prefetches or software preload) into L3 D-cache.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "l3d_useful_prf",
+        "MetricExpr": "L3D_CACHE_HIT_RWL1PRFL2PRF_FPRF",
+        "BriefDescription": "This metric measures the count of cache lines refilled by L3 data prefetcher (hardware prefetches or software preload) into L3 D-cache which are further used by instruction fetch, Load, Store, L1 prefetcher, or L2 prefetcher accesses from the instruction stream of the program.",
+        "ScaleUnit": "1count",
+        "MetricGroup": "L3_Prefetcher_Effectiveness"
+    },
+    {
+        "MetricName": "ll_cache_read_hit_ratio",
+        "MetricExpr": "(LL_CACHE_RD - LL_CACHE_MISS_RD) / LL_CACHE_RD",
+        "BriefDescription": "This metric measures the ratio of last level cache Read accesses hit in the cache to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. Note that cache accesses in this cache are either data memory access or instruction fetch as this is a system level cache.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "LL_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "ll_cache_read_miss_ratio",
+        "MetricExpr": "LL_CACHE_MISS_RD / LL_CACHE_RD",
+        "BriefDescription": "This metric measures the ratio of last level cache Read accesses missed to the total number of last level cache accesses. This gives an indication of the effectiveness of the last level cache for Read traffic. Note that cache accesses in this cache are either data memory access or instruction fetch as this is a system level cache.",
+        "ScaleUnit": "1per cache access",
+        "MetricGroup": "Miss_Ratio;LL_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "ll_cache_read_mpki",
+        "MetricExpr": "1000 * (LL_CACHE_MISS_RD / INST_RETIRED)",
+        "BriefDescription": "This metric measures the number of last level cache Read accesses missed per thousand instructions executed.",
+        "ScaleUnit": "1MPKI",
+        "MetricGroup": "MPKI;LL_Cache_Effectiveness"
+    },
+    {
+        "MetricName": "load_average_latency",
+        "MetricExpr": "MEM_ACCESS_RD_PERCYC / MEM_ACCESS",
+        "BriefDescription": "This metric measures the average latency of Load operations in CPU cycles.",
+        "ScaleUnit": "1CPU cycles",
+        "MetricGroup": "Average_Latency"
+    },
+    {
+        "MetricName": "load_percentage",
+        "MetricExpr": "100 * (LD_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures Load operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "nonsve_fp_ops_per_cycle",
+        "MetricExpr": "FP_FIXED_OPS_SPEC / CPU_CYCLES",
+        "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by an instruction that is not an SVE instruction. Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.",
+        "ScaleUnit": "1operations per cycle",
+        "MetricGroup": "FP_Arithmetic_Intensity"
+    },
+    {
+        "MetricName": "retiring",
+        "MetricExpr": "100 * ((OP_RETIRED/OP_SPEC) * (1 - (STALL_SLOT/CPU_SLOT)))",
+        "BriefDescription": "This metric is the percentage of total slots that retired operations, which indicates cycles that were utilized efficiently.",
+        "ScaleUnit": "1percent of slots",
+        "MetricGroup": "TopdownL1"
+    },
+    {
+        "MetricName": "scalar_fp_percentage",
+        "MetricExpr": "100 * (VFP_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures scalar floating point operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "simd_percentage",
+        "MetricExpr": "100 * (ASE_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures advanced SIMD operations as a percentage of total operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "store_percentage",
+        "MetricExpr": "100 * (ST_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures Store operations as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "sve_all_percentage",
+        "MetricExpr": "100 * (SVE_INST_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures scalable vector operations, including Loads and Stores, as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "Operation_Mix"
+    },
+    {
+        "MetricName": "sve_fp_ops_per_cycle",
+        "MetricExpr": "FP_SCALE_OPS_SPEC / CPU_CYCLES",
+        "BriefDescription": "This metric measures floating point operations per cycle in any precision performed by SVE instructions. Operations are counted by computation and by vector lanes, fused computations such as multiply-add count as twice per vector lane for example.",
+        "ScaleUnit": "1operations per cycle",
+        "MetricGroup": "FP_Arithmetic_Intensity"
+    },
+    {
+        "MetricName": "sve_predicate_empty_percentage",
+        "MetricExpr": "100 * (SVE_PRED_EMPTY_SPEC / SVE_PRED_SPEC)",
+        "BriefDescription": "This metric measures scalable vector operations with no active predicates as a percentage of SVE predicated operations speculatively executed.",
+        "ScaleUnit": "1percent of SVE predicated operations",
+        "MetricGroup": "SVE_Effectiveness"
+    },
+    {
+        "MetricName": "sve_predicate_full_percentage",
+        "MetricExpr": "100 * (SVE_PRED_FULL_SPEC / SVE_PRED_SPEC)",
+        "BriefDescription": "This metric measures scalable vector operations with all active predicates as a percentage of SVE predicated operations speculatively executed.",
+        "ScaleUnit": "1percent of SVE predicated operations",
+        "MetricGroup": "SVE_Effectiveness"
+    },
+    {
+        "MetricName": "sve_predicate_partial_percentage",
+        "MetricExpr": "100 * (SVE_PRED_PARTIAL_SPEC / SVE_PRED_SPEC)",
+        "BriefDescription": "This metric measures scalable vector operations with at least one active predicate as a percentage of SVE predicated operations speculatively executed.",
+        "ScaleUnit": "1percent of SVE predicated operations",
+        "MetricGroup": "SVE_Effectiveness"
+    },
+    {
+        "MetricName": "sve_predicate_percentage",
+        "MetricExpr": "100 * (SVE_PRED_SPEC / INST_SPEC)",
+        "BriefDescription": "This metric measures scalable vector operations with predicates as a percentage of operations speculatively executed.",
+        "ScaleUnit": "1percent of operations",
+        "MetricGroup": "SVE_Effectiveness"
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json
new file mode 100644
index 000000000000..8ff87d844e52
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/misc.json
@@ -0,0 +1,642 @@
+[
+    {
+        "ArchStdEvent": "SW_INCR",
+        "PublicDescription": "This event counts software writes to the PMSWINC_EL0 (software PMU increment) register. The PMSWINC_EL0 register is a manually updated counter for use by application software.\nThis event could be used to measure any user program event, such as accesses to a particular data structure (by writing to the PMSWINC_EL0 register each time the data structure is accessed).\nTo use the PMSWINC_EL0 register and event, developers must insert instructions that write to the PMSWINC_EL0 register into the source code.\nSince the SW_INCR event records writes to the PMSWINC_EL0 register, there is no need to do a Read/Increment/Write sequence to the PMSWINC_EL0 register."
+    },
+    {
+        "ArchStdEvent": "TRB_WRAP",
+        "PublicDescription": "This event is generated each time the trace buffer current Write pointer is wrapped to the trace buffer base pointer."
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT0",
+        "PublicDescription": "Trace unit external output 0."
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT1",
+        "PublicDescription": "Trace unit external output 1."
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT2",
+        "PublicDescription": "Trace unit external output 2."
+    },
+    {
+        "ArchStdEvent": "TRCEXTOUT3",
+        "PublicDescription": "Trace unit external output 3."
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT4",
+        "PublicDescription": "Cross-trigger Interface output trigger 4."
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT5",
+        "PublicDescription": "Cross-trigger Interface output trigger 5."
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT6",
+        "PublicDescription": "Cross-trigger Interface output trigger 6."
+    },
+    {
+        "ArchStdEvent": "CTI_TRIGOUT7",
+        "PublicDescription": "Cross-trigger Interface output trigger 7."
+    },
+    {
+        "EventCode": "0x00e1",
+        "EventName": "L1I_PRFM_REQ_DROP",
+        "PublicDescription": "L1 I-cache software prefetch dropped."
+    },
+    {
+        "EventCode": "0x0100",
+        "EventName": "L1_PF_REFILL",
+        "PublicDescription": "L1 prefetch requests, refilled to L1 cache."
+    },
+    {
+        "EventCode": "0x0120",
+        "EventName": "FLUSH",
+        "PublicDescription": "This event counts both the CT flush and BX flush. The BR_MIS_PRED counts the BX flushes. So FLUSH minus BR_MIS_PRED gives the CT flushes."
+    },
+    {
+        "EventCode": "0x0121",
+        "EventName": "FLUSH_MEM",
+        "PublicDescription": "Flushes due to memory hazards. This only includes CT flushes."
+    },
+    {
+        "EventCode": "0x0122",
+        "EventName": "FLUSH_BAD_BRANCH",
+        "PublicDescription": "Flushes due to bad predicted branch. This only includes CT flushes."
+    },
+    {
+        "EventCode": "0x0123",
+        "EventName": "FLUSH_STDBYPASS",
+        "PublicDescription": "Flushes due to bad predecode. This only includes CT flushes."
+    },
+    {
+        "EventCode": "0x0124",
+        "EventName": "FLUSH_ISB",
+        "PublicDescription": "Flushes due to ISB or similar side-effects. This only includes CT flushes."
+    },
+    {
+        "EventCode": "0x0125",
+        "EventName": "FLUSH_OTHER",
+        "PublicDescription": "Flushes due to other hazards. This only includes CT flushes."
+    },
+    {
+        "EventCode": "0x0126",
+        "EventName": "STORE_STREAM",
+        "PublicDescription": "Stored lines in streaming no-Write-allocate mode."
+    },
+    {
+        "EventCode": "0x0127",
+        "EventName": "NUKE_RAR",
+        "PublicDescription": "Load/Store nuke due to Read-after-Read ordering hazard."
+    },
+    {
+        "EventCode": "0x0128",
+        "EventName": "NUKE_RAW",
+        "PublicDescription": "Load/Store nuke due to Read-after-Write ordering hazard."
+    },
+    {
+        "EventCode": "0x0129",
+        "EventName": "L1_PF_GEN_PAGE",
+        "PublicDescription": "Load/Store prefetch to L1 generated, Page mode."
+    },
+    {
+        "EventCode": "0x012a",
+        "EventName": "L1_PF_GEN_STRIDE",
+        "PublicDescription": "Load/Store prefetch to L1 generated, stride mode."
+    },
+    {
+        "EventCode": "0x012b",
+        "EventName": "L2_PF_GEN_LD",
+        "PublicDescription": "Load prefetch to L2 generated."
+    },
+    {
+        "EventCode": "0x012d",
+        "EventName": "LS_PF_TRAIN_TABLE_ALLOC",
+        "PublicDescription": "LS prefetch train table entry allocated."
+    },
+    {
+        "EventCode": "0x0130",
+        "EventName": "LS_PF_GEN_TABLE_ALLOC",
+        "PublicDescription": "This event counts the number of cycles with at least one table allocation, for L2 hardware prefetches (including the software PRFM instructions that are converted into hardware prefetches due to D-TLB miss).\nLS prefetch gen table allocation (for L2 prefetches)."
+    },
+    {
+        "EventCode": "0x0131",
+        "EventName": "LS_PF_GEN_TABLE_ALLOC_PF_PEND",
+        "PublicDescription": "This event counts the number of cycles in which at least one hardware prefetch is dropped due to the inability to identify a victim when the generation table is full. The hardware prefetch considered here includes the software PRFM that is converted into hardware prefetches due to D-TLB miss."
+    },
+    {
+        "EventCode": "0x0132",
+        "EventName": "TBW",
+        "PublicDescription": "Tablewalks."
+    },
+    {
+        "EventCode": "0x0134",
+        "EventName": "S1L2_HIT",
+        "PublicDescription": "Translation cache hit on S1L2 walk cache entry."
+    },
+    {
+        "EventCode": "0x0135",
+        "EventName": "S1L1_HIT",
+        "PublicDescription": "Translation cache hit on S1L1 walk cache entry."
+    },
+    {
+        "EventCode": "0x0136",
+        "EventName": "S1L0_HIT",
+        "PublicDescription": "Translation cache hit on S1L0 walk cache entry."
+    },
+    {
+        "EventCode": "0x0137",
+        "EventName": "S2L2_HIT",
+        "PublicDescription": "Translation cache hit for S2L2 IPA walk cache entry."
+    },
+    {
+        "EventCode": "0x0138",
+        "EventName": "IPA_REQ",
+        "PublicDescription": "Translation cache lookups for IPA to PA entries."
+    },
+    {
+        "EventCode": "0x0139",
+        "EventName": "IPA_REFILL",
+        "PublicDescription": "Translation cache refills for IPA to PA entries."
+    },
+    {
+        "EventCode": "0x013a",
+        "EventName": "S1_FLT",
+        "PublicDescription": "Stage1 tablewalk fault."
+    },
+    {
+        "EventCode": "0x013b",
+        "EventName": "S2_FLT",
+        "PublicDescription": "Stage2 tablewalk fault."
+    },
+    {
+        "EventCode": "0x013c",
+        "EventName": "COLT_REFILL",
+        "PublicDescription": "Aggregated page refill."
+    },
+    {
+        "EventCode": "0x0145",
+        "EventName": "L1_PF_HIT",
+        "PublicDescription": "L1 prefetch requests, hitting in L1 cache."
+    },
+    {
+        "EventCode": "0x0146",
+        "EventName": "L1_PF",
+        "PublicDescription": "L1 prefetch requests."
+    },
+    {
+        "EventCode": "0x0147",
+        "EventName": "CACHE_LS_REFILL",
+        "PublicDescription": "L2 D-cache refill, Load/Store."
+    },
+    {
+        "EventCode": "0x0148",
+        "EventName": "CACHE_PF",
+        "PublicDescription": "L2 prefetch requests."
+    },
+    {
+        "EventCode": "0x0149",
+        "EventName": "CACHE_PF_HIT",
+        "PublicDescription": "L2 prefetch requests, hitting in L2 cache."
+    },
+    {
+        "EventCode": "0x0150",
+        "EventName": "UNUSED_PF",
+        "PublicDescription": "L2 unused prefetch."
+    },
+    {
+        "EventCode": "0x0151",
+        "EventName": "PFT_SENT",
+        "PublicDescription": "L2 prefetch TGT sent.\nNote that PFT_SENT != PFT_USEFUL + PFT_DROP. There may be PFT_SENT for which the accesses resulted in a SLC hit."
+    },
+    {
+        "EventCode": "0x0152",
+        "EventName": "PFT_USEFUL",
+        "PublicDescription": "L2 prefetch TGT useful."
+    },
+    {
+        "EventCode": "0x0153",
+        "EventName": "PFT_DROP",
+        "PublicDescription": "L2 prefetch TGT dropped."
+    },
+    {
+        "EventCode": "0x0162",
+        "EventName": "LRQ_FULL",
+        "PublicDescription": "This event counts the number of cycles the LRQ is full."
+    },
+    {
+        "EventCode": "0x0163",
+        "EventName": "FETCH_FQ_EMPTY",
+        "PublicDescription": "Fetch Queue empty cycles."
+    },
+    {
+        "EventCode": "0x0164",
+        "EventName": "FPG2",
+        "PublicDescription": "Forward progress guarantee. Medium range livelock triggered."
+    },
+    {
+        "EventCode": "0x0165",
+        "EventName": "FPG",
+        "PublicDescription": "Forward progress guarantee. Tofu global livelock buster is triggered."
+    },
+    {
+        "EventCode": "0x0172",
+        "EventName": "DEADBLOCK",
+        "PublicDescription": "Write-back evictions converted to dataless EVICT.\nThe victim line is deemed deadblock if the likeliness of a reuse is low. The Core uses dataless evict to evict a deadblock; and it uses an evict with data to evict an L2 line that is not a deadblock."
+    },
+    {
+        "EventCode": "0x0173",
+        "EventName": "PF_PRQ_ALLOC_PF_PEND",
+        "PublicDescription": "L1 prefetch prq allocation (replacing pending)."
+    },
+    {
+        "EventCode": "0x0178",
+        "EventName": "FETCH_ICACHE_INSTR",
+        "PublicDescription": "Instructions fetched from I-cache."
+    },
+    {
+        "EventCode": "0x017b",
+        "EventName": "NEAR_CAS",
+        "PublicDescription": "Near atomics: compare and swap."
+    },
+    {
+        "EventCode": "0x017c",
+        "EventName": "NEAR_CAS_PASS",
+        "PublicDescription": "Near atomics: compare and swap pass."
+    },
+    {
+        "EventCode": "0x017d",
+        "EventName": "FAR_CAS",
+        "PublicDescription": "Far atomics: compare and swap."
+    },
+    {
+        "EventCode": "0x0186",
+        "EventName": "L2_BTB_RELOAD_MAIN_BTB",
+        "PublicDescription": "Number of completed L1 BTB updates initiated by an L2 BTB hit, which swap branch information between the L1 BTB and the L2 BTB."
+    },
+    {
+        "EventCode": "0x018f",
+        "EventName": "L1_PF_GEN_MCMC",
+        "PublicDescription": "Load/Store prefetch to L1 generated, MCMC."
+    },
+    {
+        "EventCode": "0x0190",
+        "EventName": "PF_MODE_0_CYCLES",
+        "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most aggressive mode."
+    },
+    {
+        "EventCode": "0x0191",
+        "EventName": "PF_MODE_1_CYCLES",
+        "PublicDescription": "Number of cycles in which the hardware prefetcher is in the more aggressive mode."
+    },
+    {
+        "EventCode": "0x0192",
+        "EventName": "PF_MODE_2_CYCLES",
+        "PublicDescription": "Number of cycles in which the hardware prefetcher is in the less aggressive mode."
+    },
+    {
+        "EventCode": "0x0193",
+        "EventName": "PF_MODE_3_CYCLES",
+        "PublicDescription": "Number of cycles in which the hardware prefetcher is in the most conservative mode."
+    },
+    {
+        "EventCode": "0x0194",
+        "EventName": "TXREQ_LIMIT_MAX_CYCLES",
+        "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is the L2_TQ_SIZE."
+    },
+    {
+        "EventCode": "0x0195",
+        "EventName": "TXREQ_LIMIT_3QUARTER_CYCLES",
+        "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 3/4 of the L2_TQ_SIZE and the L2_TQ_SIZE-1."
+    },
+    {
+        "EventCode": "0x0196",
+        "EventName": "TXREQ_LIMIT_HALF_CYCLES",
+        "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/2 of the L2_TQ_SIZE and 3/4 of the L2_TQ_SIZE."
+    },
+    {
+        "EventCode": "0x0197",
+        "EventName": "TXREQ_LIMIT_1QUARTER_CYCLES",
+        "PublicDescription": "Number of cycles in which the dynamic TXREQ limit is between 1/4 of the L2_TQ_SIZE and 1/2 of the L2_TQ_SIZE."
+    },
+    {
+        "EventCode": "0x019d",
+        "EventName": "PREFETCH_LATE_CMC",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by CMC prefetch request."
+    },
+    {
+        "EventCode": "0x019e",
+        "EventName": "PREFETCH_LATE_BO",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by BO prefetch request."
+    },
+    {
+        "EventCode": "0x019f",
+        "EventName": "PREFETCH_LATE_STRIDE",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by STRIDE prefetch request."
+    },
+    {
+        "EventCode": "0x01a0",
+        "EventName": "PREFETCH_LATE_SPATIAL",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SPATIAL prefetch request."
+    },
+    {
+        "EventCode": "0x01a2",
+        "EventName": "PREFETCH_LATE_TBW",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by TBW prefetch request."
+    },
+    {
+        "EventCode": "0x01a3",
+        "EventName": "PREFETCH_LATE_PAGE",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by PAGE prefetch request."
+    },
+    {
+        "EventCode": "0x01a4",
+        "EventName": "PREFETCH_LATE_GSMS",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by GSMS prefetch request."
+    },
+    {
+        "EventCode": "0x01a5",
+        "EventName": "PREFETCH_LATE_SIP_CONS",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit on TQ entry allocated by SIP_CONS prefetch request."
+    },
+    {
+        "EventCode": "0x01a6",
+        "EventName": "PREFETCH_REFILL_CMC",
+        "PublicDescription": "PF/prefetch or PF/readclean request from CMC pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01a7",
+        "EventName": "PREFETCH_REFILL_BO",
+        "PublicDescription": "PF/prefetch or PF/readclean request from BO pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01a8",
+        "EventName": "PREFETCH_REFILL_STRIDE",
+        "PublicDescription": "PF/prefetch or PF/readclean request from STRIDE pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01a9",
+        "EventName": "PREFETCH_REFILL_SPATIAL",
+        "PublicDescription": "PF/prefetch or PF/readclean request from SPATIAL pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01ab",
+        "EventName": "PREFETCH_REFILL_TBW",
+        "PublicDescription": "PF/prefetch or PF/readclean request from TBW pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01ac",
+        "EventName": "PREFETCH_REFILL_PAGE",
+        "PublicDescription": "PF/prefetch or PF/readclean request from PAGE pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01ad",
+        "EventName": "PREFETCH_REFILL_GSMS",
+        "PublicDescription": "PF/prefetch or PF/readclean request from GSMS pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01ae",
+        "EventName": "PREFETCH_REFILL_SIP_CONS",
+        "PublicDescription": "PF/prefetch or PF/readclean request from SIP_CONS pf engine filled the L2 cache."
+    },
+    {
+        "EventCode": "0x01af",
+        "EventName": "CACHE_HIT_LINE_PF_CMC",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by CMC prefetch request."
+    },
+    {
+        "EventCode": "0x01b0",
+        "EventName": "CACHE_HIT_LINE_PF_BO",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by BO prefetch request."
+    },
+    {
+        "EventCode": "0x01b1",
+        "EventName": "CACHE_HIT_LINE_PF_STRIDE",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by STRIDE prefetch request."
+    },
+    {
+        "EventCode": "0x01b2",
+        "EventName": "CACHE_HIT_LINE_PF_SPATIAL",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SPATIAL prefetch request."
+    },
+    {
+        "EventCode": "0x01b4",
+        "EventName": "CACHE_HIT_LINE_PF_TBW",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by TBW prefetch request."
+    },
+    {
+        "EventCode": "0x01b5",
+        "EventName": "CACHE_HIT_LINE_PF_PAGE",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by PAGE prefetch request."
+    },
+    {
+        "EventCode": "0x01b6",
+        "EventName": "CACHE_HIT_LINE_PF_GSMS",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by GSMS prefetch request."
+    },
+    {
+        "EventCode": "0x01b7",
+        "EventName": "CACHE_HIT_LINE_PF_SIP_CONS",
+        "PublicDescription": "LS/readclean or LS/readunique lookup hit in L2 cache on line filled by SIP_CONS prefetch request."
+    },
+    {
+        "EventCode": "0x01ba",
+        "EventName": "PREFETCH_LATE_STORE_ISSUE",
+        "PublicDescription": "This event counts the number of demand requests that match a Store-issue prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements."
+    },
+    {
+        "EventCode": "0x01bb",
+        "EventName": "PREFETCH_LATE_STORE_STRIDE",
+        "PublicDescription": "This event counts the number of demand requests that match a Store-stride prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements."
+    },
+    {
+        "EventCode": "0x01bc",
+        "EventName": "PREFETCH_LATE_PC_OFFSET",
+        "PublicDescription": "This event counts the number of demand requests that match a PC-offset prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements."
+    },
+    {
+        "EventCode": "0x01bd",
+        "EventName": "PREFETCH_LATE_IFUPF",
+        "PublicDescription": "This event counts the number of demand requests that match an IFU prefetcher's pending refill request. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements."
+    },
+    {
+        "EventCode": "0x01be",
+        "EventName": "PREFETCH_REFILL_STORE_ISSUE",
+        "PublicDescription": "This event counts the number of cache refills due to Store-Issue prefetcher."
+    },
+    {
+        "EventCode": "0x01bf",
+        "EventName": "PREFETCH_REFILL_STORE_STRIDE",
+        "PublicDescription": "This event counts the number of cache refills due to Store-stride prefetcher."
+    },
+    {
+        "EventCode": "0x01c0",
+        "EventName": "PREFETCH_REFILL_PC_OFFSET",
+        "PublicDescription": "This event counts the number of cache refills due to PC-offset prefetcher."
+    },
+    {
+        "EventCode": "0x01c1",
+        "EventName": "PREFETCH_REFILL_IFUPF",
+        "PublicDescription": "This event counts the number of cache refills due to IFU prefetcher."
+    },
+    {
+        "EventCode": "0x01c2",
+        "EventName": "CACHE_HIT_LINE_PF_STORE_ISSUE",
+        "PublicDescription": "This event counts the number of first hits to a cache line filled by Store-issue prefetcher."
+    },
+    {
+        "EventCode": "0x01c3",
+        "EventName": "CACHE_HIT_LINE_PF_STORE_STRIDE",
+        "PublicDescription": "This event counts the number of first hits to a cache line filled by Store-stride prefetcher."
+    },
+    {
+        "EventCode": "0x01c4",
+        "EventName": "CACHE_HIT_LINE_PF_PC_OFFSET",
+        "PublicDescription": "This event counts the number of first hits to a cache line filled by PC-offset prefetcher."
+    },
+    {
+        "EventCode": "0x01c5",
+        "EventName": "CACHE_HIT_LINE_PF_IFUPF",
+        "PublicDescription": "This event counts the number of first hits to a cache line filled by IFU prefetcher."
+    },
+    {
+        "EventCode": "0x01c6",
+        "EventName": "L2_PF_GEN_ST_ISSUE",
+        "PublicDescription": "Store-issue prefetch to L2 generated."
+    },
+    {
+        "EventCode": "0x01c7",
+        "EventName": "L2_PF_GEN_ST_STRIDE",
+        "PublicDescription": "Store-stride prefetch to L2 generated."
+    },
+    {
+        "EventCode": "0x01cb",
+        "EventName": "L2_TQ_OUTSTANDING",
+        "PublicDescription": "Outstanding tracker count, per cycle.\nThis event increments by the number of valid entries pertaining to this thread in the L2TQ, in each cycle.\nThis event can be used to calculate the occupancy of L2TQ by dividing this by the CPU_CYCLES event. The L2TQ queue tracks the outstanding Read, Write and Snoop transactions. The Read transaction and the Write transaction entries are attributable to PE, whereas the Snoop transactions are not always attributable to PE."
+    },
+    {
+        "EventCode": "0x01cc",
+        "EventName": "TXREQ_LIMIT_COUNT_CYCLES",
+        "PublicDescription": "This event increments by the dynamic TXREQ value, in each cycle.\nThis is a companion event of TXREQ_LIMIT_MAX_CYCLES, TXREQ_LIMIT_3QUARTER_CYCLES, TXREQ_LIMIT_HALF_CYCLES, and TXREQ_LIMIT_1QUARTER_CYCLES."
+    },
+    {
+        "EventCode": "0x01ce",
+        "EventName": "L3DPRFM_TO_L2PRQ_CONVERTED",
+        "PublicDescription": "This event counts the number of Converted-L3D-PRFMs. These are indeed L3D PRFM and activities around these PRFM are counted by the L3D_CACHE_PRFM, L3D_CACHE_REFILL_PRFM and L3D_CACHE_REFILL Events."
+    },
+    {
+        "EventCode": "0x01d2",
+        "EventName": "DVM_TLBI_RCVD",
+        "PublicDescription": "This event counts the number of TLBI DVM messages received over CHI interface, for *this* Core."
+    },
+    {
+        "EventCode": "0x01d6",
+        "EventName": "DSB_COMMITING_LOCAL_TLBI",
+        "PublicDescription": "This event counts the number of DSBs that are retired and have committed at least one local TLBI instruction. This event increments no more than once (in a cycle) even if the DSB commits multiple local TLBI instructions."
+    },
+    {
+        "EventCode": "0x01d7",
+        "EventName": "DSB_COMMITING_BROADCAST_TLBI",
+        "PublicDescription": "This event counts the number of DSBs that are retired and have committed at least one broadcast TLBI instruction. This event increments no more than once (in a cycle) even if the DSB commits multiple broadcast TLBI instructions."
+    },
+    {
+        "EventCode": "0x01eb",
+        "EventName": "L1DPRFM_L2DPRFM_TO_L2PRQ_CONVERTED",
+        "PublicDescription": "This event counts the number of Converted-L1D-PRFMs and Converted-L2D-PRFM.\nActivities involving the Converted-L1D-PRFM are counted by the L1D_CACHE_PRFM. However they are *not* counted by the L1D_CACHE_REFILL_PRFM, and L1D_CACHE_REFILL, as these Converted-L1D-PRFM are treated as L2 D hardware prefetches. Activities around the Converted-L1D-PRFMs and Converted-L2D-PRFMs are counted by the L2D_CACHE_PRFM, L2D_CACHE_REFILL_PRFM and L2D_CACHE_REFILL Events."
+    },
+    {
+        "EventCode": "0x01ec",
+        "EventName": "PREFETCH_LATE_CONVERTED_PRFM",
+        "PublicDescription": "This event counts the number of demand requests that match a Converted-L1D-PRFM or Converted-L2D-PRFM pending refill request at L2 D-cache. These are called late prefetch requests and are still counted as useful prefetcher requests for the sake of accuracy and coverage measurements.\nNote that this event is not counted by the L2D_CACHE_HIT_RWL1PRF_LATE_HWPRF, though the Converted-L1D-PRFM or Converted-L2D-PRFM are replayed by the L2PRQ."
+    },
+    {
+        "EventCode": "0x01ed",
+        "EventName": "PREFETCH_REFILL_CONVERTED_PRFM",
+        "PublicDescription": "This event counts the number of L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM.\nNote: L2D_CACHE_REFILL_PRFM is inclusive of PREFETCH_REFILL_CONVERTED_PRFM, where both the PREFETCH_REFILL_CONVERTED_PRFM and the L2D_CACHE_REFILL_PRFM increment when L2 D-cache refills due to Converted-L1D-PRFM or Converted-L2D-PRFM."
+    },
+    {
+        "EventCode": "0x01ee",
+        "EventName": "CACHE_HIT_LINE_PF_CONVERTED_PRFM",
+        "PublicDescription": "This event counts the number of first hits to a cache line filled by Converted-L1D-PRFM or Converted-L2D-PRFM.\nNote that L2D_CACHE_HIT_RWL1PRF_FPRFM is inclusive of CACHE_HIT_LINE_PF_CONVERTED_PRFM, where both the CACHE_HIT_LINE_PF_CONVERTED_PRFM and the L2D_CACHE_HIT_RWL1PRF_FPRFM increment on a first hit to L2 D-cache filled by Converted-L1D-PRFM or Converted-L2D-PRFM."
+    },
+    {
+        "EventCode": "0x01f0",
+        "EventName": "TMS_ST_TO_SMT_LATENCY",
+        "PublicDescription": "This event counts the number of CPU cycles spent on TMS for ST-to-SMT switch.\nThis event is counted by both the threads - This event in both threads increment during TMS for ST-to-SMT switch."
+    },
+    {
+        "EventCode": "0x01f1",
+        "EventName": "TMS_SMT_TO_ST_LATENCY",
+        "PublicDescription": "This event counts the number of CPU cycles spent on TMS for SMT-to-ST switch. The count also includes the CPU cycles spent due to an aborted SMT-to-ST TMS attempt.\nThis event is counted only by the thread that is not in WFI."
+    },
+    {
+        "EventCode": "0x01f2",
+        "EventName": "TMS_ST_TO_SMT_COUNT",
+        "PublicDescription": "This event counts the number of completed TMS from ST-to-SMT.\nThis event is counted only by the active thread (the one that is not in WFI).\nNote: When an active thread enters the Debug state in ST-Full resource mode, it is switched to SMT mode. This is because the inactive thread cannot wake up while the other thread remains in the Debug state. To prevent this issue, threads operating in ST-Full resource mode are transitioned to SMT mode upon entering Debug state. This event count will also reflect such switches from ST to SMT mode.\n(Also see the NV_CPUACTLR14_EL1.chka_prevent_st_tx_to_smt_when_tx_in_debug_state bit to disable this behavior.)"
+    },
+    {
+        "EventCode": "0x01f3",
+        "EventName": "TMS_SMT_TO_ST_COUNT",
+        "PublicDescription": "This event counts the number of completed TMS from SMT-to-ST.\nThis event is counted only by the thread that is not in WFI."
+    },
+    {
+        "EventCode": "0x01f4",
+        "EventName": "TMS_SMT_TO_ST_COUNT_ABRT",
+        "PublicDescription": "This event counts the number of aborted TMS from SMT-to-ST.\nThis event is counted only by the thread that is not in WFI."
+    },
+    {
+        "EventCode": "0x0202",
+        "EventName": "L0I_CACHE_RD",
+        "PublicDescription": "This event counts the number of predict blocks serviced out of L0 I-cache.\nNote: The L0 I-cache performs at most 4 L0 I look-ups in a cycle: two of them service PBs from L0 I, and the other two refill the L0 I-cache from L1 I. This event counts only the L0 I-cache lookups pertaining to servicing the PB from L0 I."
+    },
+    {
+        "EventCode": "0x0203",
+        "EventName": "L0I_CACHE_REFILL",
+        "PublicDescription": "This event counts the number of L0I cache refills from L1 I-cache."
+    },
+    {
+        "EventCode": "0x0207",
+        "EventName": "INTR_LATENCY",
+        "PublicDescription": "This event counts the number of cycles elapsed between when an Interrupt is recognized (after masking) to when a uop associated with the first instruction in the destination exception level is allocated. If there is some other flush condition that pre-empts the Interrupt, then the cycle count terminates early at the first instruction executed after that flush. In the event of dropped Interrupts (when an Interrupt is deasserted before it is taken), this counter measures the number of cycles that elapse from the moment an Interrupt is recognized (post-masking) until the Interrupt is dropped or deasserted.\nNote that\n* IESB (Implicit Error Synchronization Barrier) is an internal mop, so the latency of an implicit IESB mop executed before the Interrupt is taken is included in the Interrupt latency count.\n* Nukes or TMS sequence within the window are also counted by the Interrupt latency event.\n* An SMT-to-ST TMS will be aborted on detecting the wake condition for the WFI thread. The Interrupt latency count includes any additional penalty for an aborted TMS."
+    },
+    {
+        "EventCode": "0x021c",
+        "EventName": "CWT_ALLOC_ENTRY",
+        "PublicDescription": "Cache Way Tracker Allocate entry."
+    },
+    {
+        "EventCode": "0x021d",
+        "EventName": "CWT_ALLOC_LINE",
+        "PublicDescription": "Cache Way Tracker Allocate line."
+    },
+    {
+        "EventCode": "0x021e",
+        "EventName": "CWT_HIT",
+        "PublicDescription": "Cache Way Tracker hit."
+    },
+    {
+        "EventCode": "0x021f",
+        "EventName": "CWT_HIT_TAG",
+        "PublicDescription": "Cache Way Tracker hit when ITAG lookup suppressed."
+    },
+    {
+        "EventCode": "0x0220",
+        "EventName": "CWT_REPLAY_TAG",
+        "PublicDescription": "Cache Way Tracker causes ITAG replay due to miss when ITAG lookup suppressed."
+    },
+    {
+        "EventCode": "0x0250",
+        "EventName": "GPT_REQ",
+        "PublicDescription": "GPT lookup."
+    },
+    {
+        "EventCode": "0x0251",
+        "EventName": "GPT_WC_HIT",
+        "PublicDescription": "GPT lookup hit in Walk cache."
+    },
+    {
+        "EventCode": "0x0252",
+        "EventName": "GPT_PG_HIT",
+        "PublicDescription": "GPT lookup hit in TLB."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json
new file mode 100644
index 000000000000..34c7eefa66b0
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/retired.json
@@ -0,0 +1,94 @@
+[
+    {
+        "ArchStdEvent": "INST_RETIRED",
+        "PublicDescription": "This event counts instructions that have been architecturally executed."
+    },
+    {
+        "ArchStdEvent": "CID_WRITE_RETIRED",
+        "PublicDescription": "This event counts architecturally executed writes to the CONTEXTIDR_EL1 register, which usually contains the kernel PID and can be output with hardware trace."
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed direct branches."
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_RETIRED",
+        "PublicDescription": "This event counts architecturally executed procedure returns."
+    },
+    {
+        "ArchStdEvent": "TTBR_WRITE_RETIRED",
+        "PublicDescription": "This event counts architectural writes to TTBR0/1_EL1. If virtualization host extensions are enabled (by setting the HCR_EL2.E2H bit to 1), then accesses to TTBR0/1_EL1 that are redirected to TTBR0/1_EL2, or accesses to TTBR0/1_EL12, are counted. TTBRn registers are typically updated when the kernel is swapping user-space threads or applications."
+    },
+    {
+        "ArchStdEvent": "BR_RETIRED",
+        "PublicDescription": "This event counts architecturally executed branches, whether the branch is taken or not. Instructions that explicitly write to the PC are also counted. Note that exception generating instructions, exception return instructions, and context synchronization instructions are not counted."
+    },
+    {
+        "ArchStdEvent": "BR_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts branches counted by BR_RETIRED which were mispredicted and caused a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "OP_RETIRED",
+        "PublicDescription": "This event counts micro-operations that are architecturally executed. This is a count of number of micro-operations retired from the commit queue in a single cycle."
+    },
+    {
+        "ArchStdEvent": "BR_INDNR_TAKEN_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were taken."
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed direct branches that were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed direct branches that were mispredicted and caused a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_IND_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns that were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_IND_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns that were mispredicted and caused a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed procedure returns that were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed procedure returns that were mispredicted and caused a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_INDNR_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_INDNR_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches excluding procedure returns that were mispredicted and caused a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_TAKEN_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed branches that were taken and were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_TAKEN_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed branches that were taken and were mispredicted causing a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_SKIP_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed branches that were not taken and were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_SKIP_MIS_PRED_RETIRED",
+        "PublicDescription": "This event counts architecturally executed branches that were not taken and were mispredicted causing a pipeline flush."
+    },
+    {
+        "ArchStdEvent": "BR_PRED_RETIRED",
+        "PublicDescription": "This event counts branch instructions counted by BR_RETIRED which were correctly predicted."
+    },
+    {
+        "ArchStdEvent": "BR_IND_RETIRED",
+        "PublicDescription": "This event counts architecturally executed indirect branches including procedure returns."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json
new file mode 100644
index 000000000000..00d0c5051a48
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spe.json
@@ -0,0 +1,42 @@
+[
+    {
+        "ArchStdEvent": "SAMPLE_POP",
+        "PublicDescription": "This event counts statistical profiling sample population, the count of all operations that could be sampled but may or may not be chosen for sampling."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED",
+        "PublicDescription": "This event counts statistical profiling samples taken for sampling."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FILTRATE",
+        "PublicDescription": "This event counts statistical profiling samples taken which are not removed by filtering."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_COLLISION",
+        "PublicDescription": "This event counts statistical profiling samples that have collided with a previous sample and were therefore not taken."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_BR",
+        "PublicDescription": "This event counts statistical profiling samples taken which are branches."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_LD",
+        "PublicDescription": "This event counts statistical profiling samples taken which are Loads or Load atomic operations."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_ST",
+        "PublicDescription": "This event counts statistical profiling samples taken which are Stores or Store atomic operations."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_OP",
+        "PublicDescription": "This event counts statistical profiling samples taken which are matching any operation type filters supported."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_EVENT",
+        "PublicDescription": "This event counts statistical profiling samples taken which are matching event packet filter constraints."
+    },
+    {
+        "ArchStdEvent": "SAMPLE_FEED_LAT",
+        "PublicDescription": "This event counts statistical profiling samples taken which are exceeding minimum latency set by operation latency filter constraints."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json
new file mode 100644
index 000000000000..8bc802f5f350
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/spec_operation.json
@@ -0,0 +1,230 @@
+[
+    {
+        "ArchStdEvent": "INST_SPEC",
+        "PublicDescription": "This event counts operations that have been speculatively executed."
+    },
+    {
+        "ArchStdEvent": "OP_SPEC",
+        "PublicDescription": "This event counts micro-operations speculatively executed. This is the count of the number of micro-operations dispatched in a cycle."
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_LD_SPEC",
+        "PublicDescription": "This event counts unaligned memory Read operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event does not count preload operations (PLD, PLI).\nThis event is a subset of the UNALIGNED_LDST_SPEC event."
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_ST_SPEC",
+        "PublicDescription": "This event counts unaligned memory Write operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event is a subset of the UNALIGNED_LDST_SPEC event."
+    },
+    {
+        "ArchStdEvent": "UNALIGNED_LDST_SPEC",
+        "PublicDescription": "This event counts unaligned memory operations issued by the CPU. This event counts unaligned accesses (as defined by the actual instruction), even if they are subsequently issued as multiple aligned accesses.\nThis event is the sum of the following events:\nUNALIGNED_ST_SPEC and\nUNALIGNED_LD_SPEC."
+    },
+    {
+        "ArchStdEvent": "LDREX_SPEC",
+        "PublicDescription": "This event counts Load-Exclusive operations that have been speculatively executed. For example: LDREX, LDX"
+    },
+    {
+        "ArchStdEvent": "STREX_PASS_SPEC",
+        "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed and have successfully completed the Store operation."
+    },
+    {
+        "ArchStdEvent": "STREX_FAIL_SPEC",
+        "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed and have not successfully completed the Store operation."
+    },
+    {
+        "ArchStdEvent": "STREX_SPEC",
+        "PublicDescription": "This event counts Store-exclusive operations that have been speculatively executed.\nThis event is the sum of the following events:\nSTREX_PASS_SPEC and\nSTREX_FAIL_SPEC."
+    },
+    {
+        "ArchStdEvent": "LD_SPEC",
+        "PublicDescription": "This event counts speculatively executed Load operations including Single Instruction Multiple Data (SIMD) Load operations."
+    },
+    {
+        "ArchStdEvent": "ST_SPEC",
+        "PublicDescription": "This event counts speculatively executed Store operations including Single Instruction Multiple Data (SIMD) Store operations."
+    },
+    {
+        "ArchStdEvent": "LDST_SPEC",
+        "PublicDescription": "This event counts Load and Store operations that have been speculatively executed."
+    },
+    {
+        "ArchStdEvent": "DP_SPEC",
+        "PublicDescription": "This event counts speculatively executed logical or arithmetic instructions such as MOV/MVN operations."
+    },
+    {
+        "ArchStdEvent": "ASE_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD operations excluding Load, Store, and Move micro-operations that move data to or from SIMD (vector) registers."
+    },
+    {
+        "ArchStdEvent": "VFP_SPEC",
+        "PublicDescription": "This event counts speculatively executed floating point operations. This event does not count operations that move data to or from floating point (vector) registers."
+    },
+    {
+        "ArchStdEvent": "PC_WRITE_SPEC",
+        "PublicDescription": "This event counts speculatively executed operations which cause software changes of the PC. Those operations include all taken branch operations."
+    },
+    {
+        "ArchStdEvent": "CRYPTO_SPEC",
+        "PublicDescription": "This event counts speculatively executed cryptographic operations except for PMULL and VMULL operations."
+    },
+    {
+        "ArchStdEvent": "BR_IMMED_SPEC",
+        "PublicDescription": "This event counts direct branch operations which are speculatively executed."
+    },
+    {
+        "ArchStdEvent": "BR_RETURN_SPEC",
+        "PublicDescription": "This event counts procedure return operations (RET, RETAA and RETAB) which are speculatively executed."
+    },
+    {
+        "ArchStdEvent": "BR_INDIRECT_SPEC",
+        "PublicDescription": "This event counts indirect branch operations including procedure returns, which are speculatively executed. This includes operations that force a software change of the PC, other than exception-generating operations and direct branch instructions. Some examples of the instructions counted by this event include BR Xn, RET, etc."
+    },
+    {
+        "ArchStdEvent": "ISB_SPEC",
+        "PublicDescription": "This event counts ISB operations that are executed."
+    },
+    {
+        "ArchStdEvent": "DSB_SPEC",
+        "PublicDescription": "This event counts DSB operations that are speculatively issued to Load/Store unit in the CPU."
+    },
+    {
+        "ArchStdEvent": "DMB_SPEC",
+        "PublicDescription": "This event counts DMB operations that are speculatively issued to the Load/Store unit in the CPU. This event does not count implied barriers from Load-acquire/Store-release operations."
+    },
+    {
+        "ArchStdEvent": "CSDB_SPEC",
+        "PublicDescription": "This event counts CSDB operations that are speculatively issued to the Load/Store unit in the CPU. This event does not count implied barriers from Load-acquire/Store-release operations."
+    },
+    {
+        "ArchStdEvent": "RC_LD_SPEC",
+        "PublicDescription": "This event counts any Load acquire operations that are speculatively executed. For example: LDAR, LDARH, LDARB"
+    },
+    {
+        "ArchStdEvent": "RC_ST_SPEC",
+        "PublicDescription": "This event counts any Store release operations that are speculatively executed. For example: STLR, STLRH, STLRB"
+    },
+    {
+        "ArchStdEvent": "SIMD_INST_SPEC",
+        "PublicDescription": "This event counts speculatively executed operations that are SIMD or SVE vector operations or Advanced SIMD non-scalar operations."
+    },
+    {
+        "ArchStdEvent": "ASE_INST_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD operations."
+    },
+    {
+        "ArchStdEvent": "SVE_INST_SPEC",
+        "PublicDescription": "This event counts speculatively executed operations that are SVE operations."
+    },
+    {
+        "ArchStdEvent": "INT_SPEC",
+        "PublicDescription": "This event counts speculatively executed integer arithmetic operations."
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_SPEC",
+        "PublicDescription": "This event counts speculatively executed predicated SVE operations.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events."
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_EMPTY_SPEC",
+        "PublicDescription": "This event counts speculatively executed predicated SVE operations with no active predicate elements.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events."
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_FULL_SPEC",
+        "PublicDescription": "This event counts speculatively executed predicated SVE operations with all predicate elements active.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events."
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC",
+        "PublicDescription": "This event counts speculatively executed predicated SVE operations with at least one but not all active predicate elements.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events."
+    },
+    {
+        "ArchStdEvent": "SVE_PRED_NOT_FULL_SPEC",
+        "PublicDescription": "This event counts speculatively executed predicated SVE operations with at least one non-active predicate element.\nThis counter also counts SVE operation due to instruction with Governing predicate operand that determines the Active elements that do not write to any SVE Z vector destination register using either zeroing or merging predicate. Thus, the operations due to instructions such as INCP, DECP, UQINCP, UQDECP, SQINCP, SQDECP and PNEXT, are counted by the SVE_PRED_* events."
+    },
+    {
+        "ArchStdEvent": "PRF_SPEC",
+        "PublicDescription": "This event counts speculatively executed operations that prefetch memory. For example, Scalar: PRFM, SVE: PRFB, PRFD, PRFH, or PRFW."
+    },
+    {
+        "ArchStdEvent": "SVE_LDFF_SPEC",
+        "PublicDescription": "This event counts speculatively executed SVE first fault or non-fault Load operations."
+    },
+    {
+        "ArchStdEvent": "SVE_LDFF_FAULT_SPEC",
+        "PublicDescription": "This event counts speculatively executed SVE first fault or non-fault Load operations that clear at least one bit in the FFR."
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT8_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type being an 8-bit integer."
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT16_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type being a 16-bit integer."
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT32_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type being a 32-bit integer."
+    },
+    {
+        "ArchStdEvent": "ASE_SVE_INT64_SPEC",
+        "PublicDescription": "This event counts speculatively executed Advanced SIMD or SVE integer operations with the largest data type being a 64-bit integer."
+    },
+    {
+        "EventCode": "0x011d",
+        "EventName": "SPEC_RET_STACK_FULL",
+        "PublicDescription": "This event counts predict pipe stalls due to speculative return address predictor full."
+    },
+    {
+        "EventCode": "0x011f",
+        "EventName": "MOPS_SPEC",
+        "PublicDescription": "Macro-ops speculatively decoded."
+    },
+    {
+        "EventCode": "0x0180",
+        "EventName": "BR_SPEC_PRED_TAKEN",
+        "PublicDescription": "Number of predicted taken from branch predictor."
+    },
+    {
+        "EventCode": "0x0181",
+        "EventName": "BR_SPEC_PRED_TAKEN_FROM_L2BTB",
+        "PublicDescription": "Number of predicted taken branches from L2 BTB."
+    },
+    {
+        "EventCode": "0x0182",
+        "EventName": "BR_SPEC_PRED_TAKEN_MULTI",
+        "PublicDescription": "Number of predicted taken for polymorphic branch."
+    },
+    {
+        "EventCode": "0x0185",
+        "EventName": "BR_SPEC_PRED_STATIC",
+        "PublicDescription": "Number of post-fetch predictions."
+    },
+    {
+        "EventCode": "0x01d0",
+        "EventName": "TLBI_LOCAL_SPEC",
+        "PublicDescription": "A non-broadcast TLBI instruction executed (Speculatively or otherwise) on *this* PE."
+    },
+    {
+        "EventCode": "0x01d1",
+        "EventName": "TLBI_BROADCAST_SPEC",
+        "PublicDescription": "A broadcast TLBI instruction executed (Speculatively or otherwise) on *this* PE."
+    },
+    {
+        "EventCode": "0x01e7",
+        "EventName": "BR_SPEC_PRED_ALN_REDIR",
+        "PublicDescription": "BPU predict pipe align redirect (either AL-APQ hit/miss)."
+    },
+    {
+        "EventCode": "0x0200",
+        "EventName": "SIMD_CRYPTO_INST_SPEC",
+        "PublicDescription": "SIMD, SVE, and CRYPTO instructions speculatively decoded."
+    },
+    {
+        "EventCode": "0x022e",
+        "EventName": "VPRED_LD_SPEC",
+        "PublicDescription": "This event counts the number of Speculatively-executed-Load operations with addresses produced by the value-prediction mechanism. The loaded data might be discarded if the predicted address differs from the actual address."
+    },
+    {
+        "EventCode": "0x022f",
+        "EventName": "VPRED_LD_SPEC_MISMATCH",
+        "PublicDescription": "This event counts a subset of VPRED_LD_SPEC where the predicted Load address and the actual address mismatched."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json
new file mode 100644
index 000000000000..92d9e0866c24
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/stall.json
@@ -0,0 +1,145 @@
+[
+    {
+        "ArchStdEvent": "STALL_FRONTEND",
+        "PublicDescription": "This event counts cycles when frontend could not send any micro-operations to the rename stage because of frontend resource stalls caused by fetch memory latency or branch prediction flow stalls. STALL_FRONTEND_SLOTS counts SLOTS during the cycle when this event counts. STALL_SLOT_FRONTEND will count SLOTS when this event is counted on this CPU."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND",
+        "PublicDescription": "This event counts cycles whenever the rename unit is unable to send any micro-operations to the backend of the pipeline because of backend resource constraints. Backend resource constraints can include issue stage fullness, execution stage fullness, or other internal pipeline resource fullness. All the backend slots were empty during the cycle when this event counts."
+    },
+    {
+        "ArchStdEvent": "STALL",
+        "PublicDescription": "This event counts cycles when no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall). This event is the sum of the following events:\nSTALL_FRONTEND and\nSTALL_BACKEND."
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT_BACKEND",
+        "PublicDescription": "This event counts slots per cycle in which no operations are sent from the rename unit to the backend due to backend resource constraints. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND counts at least 1. STALL_BACKEND counts during the cycle when STALL_SLOT_BACKEND is SLOTS."
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT_FRONTEND",
+        "PublicDescription": "This event counts slots per cycle in which no operations are sent to the rename unit from the frontend due to frontend resource constraints. STALL_FRONTEND counts during the cycle when STALL_SLOT_FRONTEND is SLOTS."
+    },
+    {
+        "ArchStdEvent": "STALL_SLOT",
+        "PublicDescription": "This event counts slots per cycle in which no operations are sent to the rename unit from the frontend or from the rename unit to the backend for any reason (either frontend or backend stall).\nSTALL_SLOT is the sum of the following events:\nSTALL_SLOT_FRONTEND and\nSTALL_SLOT_BACKEND."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_MEM",
+        "PublicDescription": "This event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the last level Core cache.\nLast level cache in this CPU is Level 2, hence this event counts the same as STALL_BACKEND_L2D."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_MEMBOUND",
+        "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the memory resources."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_L1I",
+        "PublicDescription": "This event counts cycles when the frontend is stalled because there is an instruction fetch request pending in the L1 I-cache."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_MEM",
+        "PublicDescription": "This event counts cycles when the frontend is stalled because there is an instruction fetch request pending in the last level Core cache.\nLast level cache in this CPU is Level 2, hence this event counts rather than STALL_FRONTEND_L2I."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_TLB",
+        "PublicDescription": "This event counts when the frontend is stalled on any TLB misses being handled. This event also counts the TLB accesses made by hardware prefetches."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_CPUBOUND",
+        "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the CPU resources excluding memory resources."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_FLOW",
+        "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage due to resource constraints in the branch prediction unit."
+    },
+    {
+        "ArchStdEvent": "STALL_FRONTEND_FLUSH",
+        "PublicDescription": "This event counts cycles when the frontend could not send any micro-operations to the rename stage as the frontend is recovering from a machine flush or resteer. Example scenarios that cause a flush include branch mispredictions, taken exceptions, microarchitectural flush etc."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_MEMBOUND",
+        "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to resource constraints in the memory resources."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_L1D",
+        "PublicDescription": "This event counts cycles when the backend is stalled because there is a pending demand Load request in progress in the L1 D-cache."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_TLB",
+        "PublicDescription": "This event counts cycles when the backend is stalled on any demand TLB misses being handled."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_ST",
+        "PublicDescription": "This event counts cycles when the backend is stalled and there is a Store that has not reached the pre-commit stage."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_CPUBOUND",
+        "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to any resource constraints in the CPU excluding memory resources."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_BUSY",
+        "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations because the issue queues are too full to take any operations for execution."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_ILOCK",
+        "PublicDescription": "This event counts cycles when the backend could not accept any micro-operations due to resource constraints imposed by input dependency."
+    },
+    {
+        "ArchStdEvent": "STALL_BACKEND_RENAME",
+        "PublicDescription": "This event counts cycles when backend is stalled even when operations are available from the frontend but at least one is not ready to be sent to the backend because no rename register is available."
+    },
+    {
+        "EventCode": "0x0158",
+        "EventName": "FLAG_DISP_STALL",
+        "PublicDescription": "Rename stalled due to FRF (Flag register file) full."
+    },
+    {
+        "EventCode": "0x0159",
+        "EventName": "GEN_DISP_STALL",
+        "PublicDescription": "Rename stalled due to GRF (General-purpose register file) full."
+    },
+    {
+        "EventCode": "0x015a",
+        "EventName": "VEC_DISP_STALL",
+        "PublicDescription": "Rename stalled due to VRF (Vector register file) full."
+    },
+    {
+        "EventCode": "0x015c",
+        "EventName": "SX_IQ_STALL",
+        "PublicDescription": "Dispatch stalled due to IQ full, SX."
+    },
+    {
+        "EventCode": "0x015d",
+        "EventName": "MX_IQ_STALL",
+        "PublicDescription": "Dispatch stalled due to IQ full, MX."
+    },
+    {
+        "EventCode": "0x015e",
+        "EventName": "LS_IQ_STALL",
+        "PublicDescription": "Dispatch stalled due to IQ full, LS."
+    },
+    {
+        "EventCode": "0x015f",
+        "EventName": "VX_IQ_STALL",
+        "PublicDescription": "Dispatch stalled due to IQ full, VX."
+    },
+    {
+        "EventCode": "0x0160",
+        "EventName": "MCQ_FULL_STALL",
+        "PublicDescription": "Dispatch stalled due to MCQ full."
+    },
+    {
+        "EventCode": "0x01cf",
+        "EventName": "PRD_DISP_STALL",
+        "PublicDescription": "Rename stalled because the (physical) predicate registers are full."
+    },
+    {
+        "EventCode": "0x01e0",
+        "EventName": "CSDB_STALL",
+        "PublicDescription": "Rename stalled due to CSDB."
+    },
+    {
+        "EventCode": "0x01e2",
+        "EventName": "STALL_SLOT_FRONTEND_WITHOUT_MISPRED",
+        "PublicDescription": "Stall slot frontend during non-mispredicted branch.\nThis event counts the STALL_SLOT_FRONTEND Events, except for the 4 cycles following a mispredicted branch Event or 4 cycles following a commit flush&restart Event."
+    }
+]
diff --git a/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json b/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json
new file mode 100644
index 000000000000..18ec5c348c87
--- /dev/null
+++ b/tools/perf/pmu-events/arch/arm64/nvidia/t410/tlb.json
@@ -0,0 +1,158 @@
+[
+    {
+        "ArchStdEvent": "L1I_TLB_REFILL",
+        "PublicDescription": "This event counts L1 Instruction TLB refills from any instruction fetch (demand, hardware prefetch, and software preload accesses). If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL",
+        "PublicDescription": "This event counts L1 Data TLB accesses that resulted in TLB refills. If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count on an access from an AT (Address Translation) instruction.\nThis event counts the sum of the following events:\nL1D_TLB_REFILL_RD and\nL1D_TLB_REFILL_WR."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB",
+        "PublicDescription": "This event counts L1 Data TLB accesses caused by any memory Load or Store operation.\nNote that Load or Store instructions can be broken up into multiple memory operations.\nThis event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "L1I_TLB",
+        "PublicDescription": "This event counts L1 instruction TLB accesses (caused by demand or hardware prefetch or software preload accesses), whether the access hits or misses in the TLB. This event counts both demand accesses and prefetch or preload generated accesses.\nThis event is a superset of the L1I_TLB_REFILL event."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL",
+        "PublicDescription": "This event counts L2 TLB refills caused by memory operations from both data and instruction fetch, except for those caused by TLB maintenance operations and hardware prefetches.\nThis event is the sum of the following events:\nL2D_TLB_REFILL_RD and\nL2D_TLB_REFILL_WR."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB",
+        "PublicDescription": "This event counts L2 TLB accesses except those caused by TLB maintenance operations.\nThis event is the sum of the following events:\nL2D_TLB_RD and\nL2D_TLB_WR."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK",
+        "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations.\nThis event does not include prefetches."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK",
+        "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that partial translations that cause a translation table walk are also counted. Also note that this event does not count walks triggered by TLB maintenance operations.\nThis event does not include prefetches."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_RD",
+        "PublicDescription": "This event counts L1 Data TLB refills caused by memory Read operations. If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the translation table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count on an access from an Address Translation (AT) instruction.\nThis event is a subset of the L1D_TLB_REFILL event."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_REFILL_WR",
+        "PublicDescription": "This event counts L1 Data TLB refills caused by data side memory Write operations. If there are multiple misses in the TLB that are resolved by the refill, then this event only counts once. This event counts for refills caused by preload instructions or hardware prefetch accesses. This event counts regardless of whether the miss hits in L2 or results in a translation table walk. This event will not count if the table walk results in a fault (such as a translation or access fault), since there is no new translation created for the TLB. This event will not count with an access from an Address Translation (AT) instruction.\nThis event is a subset of the L1D_TLB_REFILL event."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_RD",
+        "PublicDescription": "This event counts L1 Data TLB accesses caused by memory Read operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_WR",
+        "PublicDescription": "This event counts any L1 Data side TLB accesses caused by memory Write operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_RD",
+        "PublicDescription": "This event counts L2 TLB refills caused by memory Read operations from both data and instruction fetch except for those caused by TLB maintenance operations or hardware prefetches.\nThis event is a subset of the L2D_TLB_REFILL event."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_REFILL_WR",
+        "PublicDescription": "This event counts L2 TLB refills caused by memory Write operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB_REFILL event."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_RD",
+        "PublicDescription": "This event counts L2 TLB accesses caused by memory Read operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB event."
+    },
+    {
+        "ArchStdEvent": "L2D_TLB_WR",
+        "PublicDescription": "This event counts L2 TLB accesses caused by memory Write operations from both data and instruction fetch except for those caused by TLB maintenance operations.\nThis event is a subset of the L2D_TLB event."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK_PERCYC",
+        "PublicDescription": "This event counts the number of data translation table walks in progress per cycle."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK_PERCYC",
+        "PublicDescription": "This event counts the number of instruction translation table walks in progress per cycle."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_RW",
+        "PublicDescription": "This event counts L1 Data TLB demand accesses caused by memory Read or Write operations. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "L1I_TLB_RD",
+        "PublicDescription": "This event counts L1 Instruction TLB demand accesses whether the access hits or misses in the TLB."
+    },
+    {
+        "ArchStdEvent": "L1D_TLB_PRFM",
+        "PublicDescription": "This event counts L1 Data TLB accesses generated by software prefetch or preload memory accesses. Load or Store instructions can be broken into multiple memory operations. This event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "L1I_TLB_PRFM",
+        "PublicDescription": "This event counts L1 Instruction TLB accesses generated by software preload or prefetch instructions. This event counts whether the access hits or misses in the TLB. This event does not count TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "DTLB_HWUPD",
+        "PublicDescription": "This event counts number of memory accesses triggered by a data translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD. Note that this event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers."
+    },
+    {
+        "ArchStdEvent": "ITLB_HWUPD",
+        "PublicDescription": "This event counts number of memory accesses triggered by an instruction translation table walk and performing an update of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD."
+    },
+    {
+        "ArchStdEvent": "DTLB_STEP",
+        "PublicDescription": "This event counts number of memory accesses triggered by a demand data translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that this event counts accesses triggered by software preloads, but not accesses triggered by hardware prefetchers."
+    },
+    {
+        "ArchStdEvent": "ITLB_STEP",
+        "PublicDescription": "This event counts number of memory accesses triggered by an instruction translation table walk and performing a Read of a translation table entry. Memory accesses are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK_LARGE",
+        "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. If DTLB_WALK_BLOCK is implemented, then it is an alias for this event in this family.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK_LARGE",
+        "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and yielding a large page. The set of large pages is defined as all pages with a final size higher than or equal to 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_BLOCK event.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK_SMALL",
+        "PublicDescription": "This event counts number of data translation table walks caused by a miss in the L2 TLB and yielding a small page. The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. If DTLB_WALK_PAGE event is implemented, then it is an alias for this event in this family. Note that partial translations that cause a translation table walk are also counted.\nAlso note that this event counts walks triggered by software preloads, but not walks triggered by hardware prefetchers, and that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK_SMALL",
+        "PublicDescription": "This event counts number of instruction translation table walks caused by a miss in the L2 TLB and yielding a small page. The set of small pages is defined as all pages with a final size lower than 2MB. Translation table walks that end up taking a translation fault are not counted, as the page size would be undefined in that case. In this family, this is equal to ITLB_WALK_PAGE event.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK_RW",
+        "PublicDescription": "This event counts number of demand data translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK_RD",
+        "PublicDescription": "This event counts number of demand instruction translation table walks caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "DTLB_WALK_PRFM",
+        "PublicDescription": "This event counts the number of data translation table walks generated by software prefetches or preloads, caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "ArchStdEvent": "ITLB_WALK_PRFM",
+        "PublicDescription": "This event counts the number of instruction translation table walks generated by software prefetches or preloads, caused by a miss in the L2 TLB and performing at least one memory access. Translation table walks are counted even if the translation ended up taking a translation fault for reasons different than EPD, E0PD and NFD.\nNote that partial translations that cause a translation table walk are also counted.\nAlso note that this event does not count walks triggered by TLB maintenance operations."
+    },
+    {
+        "EventCode": "0x010e",
+        "EventName": "L1D_TLB_REFILL_RD_PF",
+        "PublicDescription": "L1 Data TLB refill, Read, prefetch."
+    },
+    {
+        "EventCode": "0x010f",
+        "EventName": "L2TLB_PF_REFILL",
+        "PublicDescription": "L2 Data TLB refill, Read, prefetch.\nThis event counts MMU refills due to internal PFStream requests."
+    },
+    {
+        "EventCode": "0x0223",
+        "EventName": "L1I_TLB_REFILL_RD",
+        "PublicDescription": "L1 Instruction TLB refills due to Demand miss."
+    },
+    {
+        "EventCode": "0x0224",
+        "EventName": "L1I_TLB_REFILL_PRFM",
+        "PublicDescription": "L1 Instruction TLB refills due to Software prefetch miss."
+    }
+]

From f182573e06abb635f320b0fd0e60972c4c2467c5 Mon Sep 17 00:00:00 2001
From: Changbin Du <changbin.du@huawei.com>
Date: Mon, 9 Mar 2026 17:44:12 +0000
Subject: [PATCH 023/131] perf tools: Add layout support for --symfs option

Add support for parsing an optional layout parameter in the --symfs
command line option. The format is:

  --symfs <directory[,layout]>

Where layout can be:
  - 'hierarchy': matches full path (default)
  - 'flat': only matches base name

When debugging symbol files from a copy of the filesystem (e.g., from a
container or remote machine), the debug files are often stored in a
flat directory structure with only filenames, not the full original
paths. In this case, using 'flat' layout allows perf to find debug
symbols by matching only the filename rather than the full path.

For example, given a binary path like:
  /build/output/lib/foo.so

With 'perf report --symfs /debug/files,flat', perf will look for:
  /debug/files/foo.so

Instead of:
  /debug/files/build/output/lib/foo.so

This is particularly useful when:
- Extracting debug files from containers with different directory layouts
- Working with build systems that flatten directory structures

Signed-off-by: Changbin Du <changbin.du@huawei.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-annotate.txt  |  7 +++--
 tools/perf/Documentation/perf-diff.txt      |  7 +++--
 tools/perf/Documentation/perf-kwork.txt     |  7 +++--
 tools/perf/Documentation/perf-probe.txt     |  6 ++++
 tools/perf/Documentation/perf-report.txt    |  7 +++--
 tools/perf/Documentation/perf-sched.txt     |  7 +++--
 tools/perf/Documentation/perf-script.txt    |  7 +++--
 tools/perf/Documentation/perf-timechart.txt |  7 +++--
 tools/perf/Documentation/tips.txt           |  2 +-
 tools/perf/builtin-annotate.c               |  3 +-
 tools/perf/builtin-diff.c                   |  3 +-
 tools/perf/builtin-kwork.c                  |  4 +--
 tools/perf/builtin-probe.c                  |  4 +--
 tools/perf/builtin-report.c                 |  3 +-
 tools/perf/builtin-sched.c                  |  4 +--
 tools/perf/builtin-script.c                 |  3 +-
 tools/perf/builtin-timechart.c              |  3 +-
 tools/perf/util/symbol.c                    | 35 ++++++++++++++++++---
 tools/perf/util/symbol.h                    | 18 +++++++++++
 tools/perf/util/symbol_conf.h               |  1 +
 20 files changed, 103 insertions(+), 35 deletions(-)

diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index 547f1a268018..a688738809c4 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -110,8 +110,11 @@ include::itrace.txt[]
 	Interleave source code with assembly code. Enabled by default,
 	disable with --no-source.
 
---symfs=<directory>::
-        Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 -M::
 --disassembler-style=:: Set disassembler style for objdump.
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index 58efab72d2e5..8e4a3f093135 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -81,8 +81,11 @@ OPTIONS
 --force::
         Don't do ownership validation.
 
---symfs=<directory>::
-        Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 -b::
 --baseline-only::
diff --git a/tools/perf/Documentation/perf-kwork.txt b/tools/perf/Documentation/perf-kwork.txt
index 21e607669d78..5c33a1fb2ffe 100644
--- a/tools/perf/Documentation/perf-kwork.txt
+++ b/tools/perf/Documentation/perf-kwork.txt
@@ -169,8 +169,11 @@ OPTIONS for 'perf kwork timehist'
 --max-stack::
 	Maximum number of functions to display in backtrace, default 5.
 
---symfs=<directory>::
-    Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 --time::
 	Only analyze samples within given time window: <start>,<stop>. Times
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 5c43a6edc0e5..2e5790325430 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -50,6 +50,12 @@ OPTIONS
 --source=PATH::
 	Specify path to kernel source.
 
+--symfs=<directory[,layout]>::
+	Look for files with symbols relative to this directory. The optional
+	layout can be 'hierarchy' (default, matches full path) or 'flat'
+	(only matches base name). This is useful when debug files are stored
+	in a flat directory structure.
+
 -v::
 --verbose::
         Be more verbose (show parsed arguments, etc).
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index acef3ff4178e..802f931ae64d 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -368,8 +368,11 @@ OPTIONS
 --force::
         Don't do ownership validation.
 
---symfs=<directory>::
-        Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 -C::
 --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 4d9981609c04..a4221398e5e0 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -437,8 +437,11 @@ OPTIONS for 'perf sched timehist'
     Show all scheduling events followed by a summary by thread with min,
     max, and average run times (in sec) and relative stddev.
 
---symfs=<directory>::
-    Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 -V::
 --cpu-visual::
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index ddf92f9c7821..200ea25891d8 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -307,8 +307,11 @@ OPTIONS
 --kallsyms=<file>::
         kallsyms pathname
 
---symfs=<directory>::
-        Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 
 -G::
 --hide-call-graph::
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
index ef2281c56743..bacc5df3c400 100644
--- a/tools/perf/Documentation/perf-timechart.txt
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -53,8 +53,11 @@ TIMECHART OPTIONS
 -f::
 --force::
 	Don't complain, do it.
---symfs=<directory>::
-        Look for files with symbols relative to this directory.
+--symfs=<directory[,layout]>::
+        Look for files with symbols relative to this directory. The optional
+        layout can be 'hierarchy' (default, matches full path) or 'flat'
+        (only matches base name). This is useful when debug files are stored
+        in a flat directory structure.
 -n::
 --proc-num::
         Print task info for at least given number of tasks.
diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt
index 3fee9b2a88ea..ebf12a8c5db5 100644
--- a/tools/perf/Documentation/tips.txt
+++ b/tools/perf/Documentation/tips.txt
@@ -11,7 +11,7 @@ Search options using a keyword: perf report -h <keyword>
 Use parent filter to see specific call path: perf report -p <regex>
 List events using substring match: perf list <keyword>
 To see list of saved events and attributes: perf evlist -v
-Use --symfs <dir> if your symbol files are in non-standard locations
+Use --symfs <dir>[,layout] if your symbol files are in non-standard locations.
 To see callchains in a more compact form: perf report -g folded
 To see call chains by final symbol taking CPU time (bottom up) use perf report -G
 Show individual samples with: perf script
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 9c27bb30b708..686ad08561d6 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -744,8 +744,7 @@ int cmd_annotate(int argc, const char **argv)
 			&annotate.group_set,
 			"Show event group information together"),
 	OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
-	OPT_CALLBACK(0, "symfs", NULL, "directory",
-		     "Look for files with symbols relative to this directory",
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
 		     symbol__config_symfs),
 	OPT_BOOLEAN(0, "source", &annotate_opts.annotate_src,
 		    "Interleave source code with assembly code (default)"),
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 59bf1f72d12e..69069926dd0b 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1280,8 +1280,7 @@ static const struct option options[] = {
 	OPT_STRING_NOEMPTY('t', "field-separator", &symbol_conf.field_sep, "separator",
 		   "separator for columns, no spaces will be added between "
 		   "columns '.' is reserved."),
-	OPT_CALLBACK(0, "symfs", NULL, "directory",
-		     "Look for files with symbols relative to this directory",
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
 		     symbol__config_symfs),
 	OPT_UINTEGER('o', "order", &sort_compute, "Specify compute sorting."),
 	OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c
index 7f3068264568..6f94a8f45f60 100644
--- a/tools/perf/builtin-kwork.c
+++ b/tools/perf/builtin-kwork.c
@@ -2423,8 +2423,8 @@ int cmd_kwork(int argc, const char **argv)
 		    "Display call chains if present"),
 	OPT_UINTEGER(0, "max-stack", &kwork.max_stack,
 		   "Maximum number of functions to display backtrace."),
-	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-		    "Look for files with symbols relative to this directory"),
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
+		     symbol__config_symfs),
 	OPT_STRING(0, "time", &kwork.time_str, "str",
 		   "Time span for analysis (start,stop)"),
 	OPT_STRING('C', "cpu", &kwork.cpu_list, "cpu",
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 1b4ba85ee019..a67b565278ae 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -597,8 +597,8 @@ __cmd_probe(int argc, const char **argv)
 	OPT_BOOLEAN(0, "demangle-kernel", &symbol_conf.demangle_kernel,
 		    "Enable kernel symbol demangling"),
 	OPT_BOOLEAN(0, "cache", &probe_conf.cache, "Manipulate probe cache"),
-	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-		   "Look for files with symbols relative to this directory"),
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
+		     symbol__config_symfs),
 	OPT_CALLBACK(0, "target-ns", NULL, "pid",
 		     "target pid for namespace contexts", opt_set_target_ns),
 	OPT_BOOLEAN(0, "bootconfig", &probe_conf.bootconfig,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 3b81f4b3dc49..343c0ada5ea1 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1416,8 +1416,7 @@ int cmd_report(int argc, const char **argv)
 		   "columns '.' is reserved."),
 	OPT_BOOLEAN('U', "hide-unresolved", &symbol_conf.hide_unresolved,
 		    "Only display entries resolved to a symbol"),
-	OPT_CALLBACK(0, "symfs", NULL, "directory",
-		     "Look for files with symbols relative to this directory",
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
 		     symbol__config_symfs),
 	OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
 		   "list of cpus to profile"),
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 3f509cfdd58c..d083e2bb7703 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -4879,8 +4879,8 @@ int cmd_sched(int argc, const char **argv)
 		    "Display call chains if present (default on)"),
 	OPT_UINTEGER(0, "max-stack", &sched.max_stack,
 		   "Maximum number of functions to display backtrace."),
-	OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
-		    "Look for files with symbols relative to this directory"),
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
+		     symbol__config_symfs),
 	OPT_BOOLEAN('s', "summary", &sched.summary_only,
 		    "Show only syscall summary with statistics"),
 	OPT_BOOLEAN('S', "with-summary", &sched.summary,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 9f8b0fd27a0a..b80c406d1fc1 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -4078,8 +4078,7 @@ int cmd_script(int argc, const char **argv)
 		   "file", "kallsyms pathname"),
 	OPT_BOOLEAN('G', "hide-call-graph", &no_callchain,
 		    "When printing symbols do not display call chain"),
-	OPT_CALLBACK(0, "symfs", NULL, "directory",
-		     "Look for files with symbols relative to this directory",
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
 		     symbol__config_symfs),
 	OPT_CALLBACK('F', "fields", NULL, "str",
 		     "comma separated output fields prepend with 'type:'. "
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index f8b49d69e9a5..28f33e39895d 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1951,8 +1951,7 @@ int cmd_timechart(int argc, const char **argv)
 	OPT_CALLBACK('p', "process", NULL, "process",
 		      "process selector. Pass a pid or process name.",
 		       parse_process),
-	OPT_CALLBACK(0, "symfs", NULL, "directory",
-		     "Look for files with symbols relative to this directory",
+	OPT_CALLBACK(0, "symfs", NULL, "directory[,layout]", SYMFS_HELP,
 		     symbol__config_symfs),
 	OPT_INTEGER('n', "proc-num", &tchart.proc_num,
 		    "min. number of tasks to print"),
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 8662001e1e25..bd811b2b7890 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -66,6 +66,7 @@ struct symbol_conf symbol_conf = {
 	.time_quantum		= 100 * NSEC_PER_MSEC, /* 100ms */
 	.show_hist_headers	= true,
 	.symfs			= "",
+	.symfs_layout_flat	= false,
 	.event_group		= true,
 	.inline_name		= true,
 	.res_sample		= 0,
@@ -2491,16 +2492,42 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,
 			 const char *dir, int unset __maybe_unused)
 {
 	char *bf = NULL;
+	char *layout_str;
+	char *dir_copy;
 	int ret;
 
-	symbol_conf.symfs = strdup(dir);
-	if (symbol_conf.symfs == NULL)
-		return -ENOMEM;
+	layout_str = strrchr(dir, ',');
+	if (layout_str) {
+		size_t dir_len = layout_str - dir;
+
+		dir_copy = strndup(dir, dir_len);
+		if (dir_copy == NULL)
+			return -ENOMEM;
+
+		symbol_conf.symfs = dir_copy;
+
+		layout_str++;
+		if (!strcmp(layout_str, "flat"))
+			symbol_conf.symfs_layout_flat = true;
+		else if (!strcmp(layout_str, "hierarchy"))
+			symbol_conf.symfs_layout_flat = false;
+		else {
+			pr_err("Invalid layout: '%s', use 'hierarchy' or 'flat'\n",
+			       layout_str);
+			free(dir_copy);
+			return -EINVAL;
+		}
+	} else {
+		symbol_conf.symfs = strdup(dir);
+		if (symbol_conf.symfs == NULL)
+			return -ENOMEM;
+		symbol_conf.symfs_layout_flat = false;
+	}
 
 	/* skip the locally configured cache if a symfs is given, and
 	 * config buildid dir to symfs/.debug
 	 */
-	ret = asprintf(&bf, "%s/%s", dir, ".debug");
+	ret = asprintf(&bf, "%s/%s", symbol_conf.symfs, ".debug");
 	if (ret < 0)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 3fb5d146d9b1..4f1dbd1ebd99 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/rbtree.h>
 #include <stdio.h>
+#include <errno.h>
 #include "addr_location.h"
 #include "path.h"
 #include "symbol_conf.h"
@@ -96,6 +97,18 @@ struct intlist;
 
 static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
 {
+	if (symbol_conf.symfs_layout_flat) {
+		char *path_copy = strdup(path);
+		char *base;
+		int ret;
+
+		if (!path_copy)
+			return -ENOMEM;
+		base = basename(path_copy);
+		ret = path__join(bf, size, symbol_conf.symfs, base);
+		free(path_copy);
+		return ret;
+	}
 	return path__join(bf, size, symbol_conf.symfs, path);
 }
 
@@ -169,6 +182,11 @@ size_t symbol__fprintf_symname(const struct symbol *sym, FILE *fp);
 size_t symbol__fprintf(struct symbol *sym, FILE *fp);
 bool symbol__restricted_filename(const char *filename,
 				 const char *restricted_filename);
+
+#define SYMFS_HELP "setup root directory which contains debug files:\n" \
+	"\t\t\t\t" "directory:\tLook for files with symbols relative to this directory.\n" \
+	"\t\t\t\t" "layout:   \tLayout of files, 'hierarchy' matches full path (default), 'flat' only matches base name.\n"
+
 int symbol__config_symfs(const struct option *opt __maybe_unused,
 			 const char *dir, int unset __maybe_unused);
 
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index 71bb17372a6c..ac1b444a8fd8 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -93,6 +93,7 @@ struct symbol_conf {
 			*tid_list,
 			*addr_list;
 	const char	*symfs;
+	bool		symfs_layout_flat;
 	int		res_sample;
 	int		pad_output_len_dso;
 	int		group_sort_idx;

From ad2f6258dd1d484f328d5cdcc1bc760419636cb2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 6 Mar 2026 16:22:22 -0800
Subject: [PATCH 024/131] perf disasm: Fix potential use-after-free on fileloc

The fileloc is a copy of a pointer to a string, but in places like
symbol_disassemble__llvm this string appears to be freed, setting up
potential use-after-frees:

llvm.c:
```
		dl = disasm_line__new(args);
		if (dl == NULL)
			goto err;

		annotation_line__add(&dl->al, &notes->src->source);

		free(args->fileloc);
```

disasm.c:
```
static void annotation_line__init(struct annotation_line *al,
				  struct annotate_args *args,
				  int nr)
{
	al->offset = args->offset;
	al->line = strdup(args->line);
	al->line_nr = args->line_nr;
	al->fileloc = args->fileloc;
	al->data_nr = nr;
}

struct disasm_line *disasm_line__new(struct annotate_args *args)
{
	struct disasm_line *dl = NULL;
	struct annotation *notes = symbol__annotation(args->ms->sym);
	int nr = notes->src->nr_events;

	dl = zalloc(disasm_line_size(nr));
	if (!dl)
		return NULL;

	annotation_line__init(&dl->al, args, nr);
```

Fix this by making the fileloc a copy of the underlying string in its
init/exit.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/disasm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index ddcc488f2e5f..3fcb3634a7e0 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -908,13 +908,14 @@ static void annotation_line__init(struct annotation_line *al,
 	al->offset = args->offset;
 	al->line = strdup(args->line);
 	al->line_nr = args->line_nr;
-	al->fileloc = args->fileloc;
+	al->fileloc = args->fileloc ? strdup(args->fileloc) : NULL;
 	al->data_nr = nr;
 }
 
 static void annotation_line__exit(struct annotation_line *al)
 {
 	zfree_srcline(&al->path);
+	zfree(&al->fileloc);
 	zfree(&al->line);
 	zfree(&al->cycles);
 	zfree(&al->br_cntr);
@@ -950,7 +951,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args)
 
 	annotation_line__init(&dl->al, args, nr);
 	if (dl->al.line == NULL)
-		goto out_delete;
+		goto out_free_line;
 
 	if (args->offset != -1) {
 		if (arch__is_powerpc(args->arch)) {
@@ -965,8 +966,7 @@ struct disasm_line *disasm_line__new(struct annotate_args *args)
 	return dl;
 
 out_free_line:
-	zfree(&dl->al.line);
-out_delete:
+	annotation_line__exit(&dl->al);
 	free(dl);
 	return NULL;
 }

From c969a9d7bbf46f983c4a48566b3b2f7340b02296 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 12 Mar 2026 15:31:31 -0700
Subject: [PATCH 025/131] perf branch: Avoid incrementing NULL

If the entry is NULL the value is meaningless so early return NULL to
avoid an increment of NULL. This was happening in calls from
has_stitched_lbr when running the "perf record LBR tests". The return
value isn't used in that case, so returning NULL has no effect.

Fixes: 42bbabed09ce ("perf tools: Add hw_idx in struct branch_stack")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/branch.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/util/branch.h b/tools/perf/util/branch.h
index 7429530fa774..a1d4736497c4 100644
--- a/tools/perf/util/branch.h
+++ b/tools/perf/util/branch.h
@@ -66,6 +66,9 @@ static inline struct branch_entry *perf_sample__branch_entries(struct perf_sampl
 {
 	u64 *entry = (u64 *)sample->branch_stack;
 
+	if (entry == NULL)
+		return NULL;
+
 	entry++;
 	if (sample->no_hw_idx)
 		return (struct branch_entry *)entry;

From ed09766cd0bff29a537c6262a2dfca3643c2f6e6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 00:24:24 -0800
Subject: [PATCH 026/131] perf symbol: Reduce scope of
 elf__needs_adjust_symbols

Function is only used by symsrc__init in symbol-elf.c, make static to
reduce scope. Switch to not passing the argument by value but as a
pointer.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/symbol-elf.c | 8 ++++----
 tools/perf/util/symbol.h     | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 76912c62b6a0..d7582dbf379e 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1054,15 +1054,15 @@ void symsrc__destroy(struct symsrc *ss)
 	close(ss->fd);
 }
 
-bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
+static bool elf__needs_adjust_symbols(const GElf_Ehdr *ehdr)
 {
 	/*
 	 * Usually vmlinux is an ELF file with type ET_EXEC for most
 	 * architectures; except Arm64 kernel is linked with option
 	 * '-share', so need to check type ET_DYN.
 	 */
-	return ehdr.e_type == ET_EXEC || ehdr.e_type == ET_REL ||
-	       ehdr.e_type == ET_DYN;
+	return ehdr->e_type == ET_EXEC || ehdr->e_type == ET_REL ||
+	       ehdr->e_type == ET_DYN;
 }
 
 static Elf *read_gnu_debugdata(struct dso *dso, Elf *elf, const char *name, int *fd_ret)
@@ -1235,7 +1235,7 @@ int symsrc__init(struct symsrc *ss, struct dso *dso, const char *name,
 	if (dso__kernel(dso) == DSO_SPACE__USER)
 		ss->adjust_symbols = true;
 	else
-		ss->adjust_symbols = elf__needs_adjust_symbols(ehdr);
+		ss->adjust_symbols = elf__needs_adjust_symbols(&ehdr);
 
 	ss->name   = strdup(name);
 	if (!ss->name) {
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 4f1dbd1ebd99..c67814d6d6d6 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -235,7 +235,6 @@ int setup_intlist(struct intlist **list, const char *list_str,
 		  const char *list_name);
 
 #ifdef HAVE_LIBELF_SUPPORT
-bool elf__needs_adjust_symbols(GElf_Ehdr ehdr);
 void arch__sym_update(struct symbol *s, GElf_Sym *sym);
 #endif
 

From 8e6f3103c079d44b51177449cd93af4c18733194 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 00:24:25 -0800
Subject: [PATCH 027/131] perf dump-insn: Remove dump-insn.c

dump_insn and arch_is_uncond_branch are declared in
intel-pt-insn-decoder.c which is unconditionally part of all perf
builds. Don't declare weak versions of these symbols that will be unused.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/Build       |  1 -
 tools/perf/util/dump-insn.c | 23 -----------------------
 2 files changed, 24 deletions(-)
 delete mode 100644 tools/perf/util/dump-insn.c

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index bcccad7487a9..89de23dec401 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -149,7 +149,6 @@ endif
 perf-util-y += cs-etm-base.o
 
 perf-util-y += parse-branch-options.o
-perf-util-y += dump-insn.o
 perf-util-y += parse-regs-options.o
 perf-util-y += parse-sublevel-options.o
 perf-util-y += term.o
diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c
deleted file mode 100644
index c1cc0ade48d0..000000000000
--- a/tools/perf/util/dump-insn.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compiler.h>
-#include "dump-insn.h"
-
-/* Fallback code */
-
-__weak
-const char *dump_insn(struct perf_insn *x __maybe_unused,
-		      u64 ip __maybe_unused, u8 *inbuf __maybe_unused,
-		      int inlen __maybe_unused, int *lenp)
-{
-	if (lenp)
-		*lenp = 0;
-	return "?";
-}
-
-__weak
-int arch_is_uncond_branch(const unsigned char *buf __maybe_unused,
-		   size_t len __maybe_unused,
-		   int x86_64 __maybe_unused)
-{
-	return 0;
-}

From 2907fd820b8f1e4563ecd624989fd5a4db479c2f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 00:24:26 -0800
Subject: [PATCH 028/131] perf tool: Constify the command and option arrays

Reduce scope and capture immutability.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/perf.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index f475a8664ffc..1f51e8de6b1b 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -48,7 +48,7 @@ struct cmd_struct {
 	int option;
 };
 
-static struct cmd_struct commands[] = {
+static const struct cmd_struct commands[] = {
 	{ "archive",	NULL,	0 },
 	{ "buildid-cache", cmd_buildid_cache, 0 },
 	{ "buildid-list", cmd_buildid_list, 0 },
@@ -178,7 +178,7 @@ static int set_debug_file(const char *path)
 	return 0;
 }
 
-struct option options[] = {
+static const struct option options[] = {
 	OPT_ARGUMENT("help", "help"),
 	OPT_ARGUMENT("version", "version"),
 	OPT_ARGUMENT("exec-path", "exec-path"),
@@ -280,7 +280,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 			unsigned int i;
 
 			for (i = 0; i < ARRAY_SIZE(commands); i++) {
-				struct cmd_struct *p = commands+i;
+				const struct cmd_struct *p = commands + i;
 				printf("%s ", p->cmd);
 			}
 			putchar('\n');
@@ -289,7 +289,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 			unsigned int i;
 
 			for (i = 0; i < ARRAY_SIZE(options)-1; i++) {
-				struct option *p = options+i;
+				const struct option *p = options + i;
 				printf("--%s ", p->long_name);
 			}
 			putchar('\n');
@@ -331,7 +331,7 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
 #define RUN_SETUP	(1<<0)
 #define USE_PAGER	(1<<1)
 
-static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+static int run_builtin(const struct cmd_struct *p, int argc, const char **argv)
 {
 	int status;
 	struct stat st;
@@ -390,7 +390,7 @@ static void handle_internal_command(int argc, const char **argv)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(commands); i++) {
-		struct cmd_struct *p = commands+i;
+		const struct cmd_struct *p = commands+i;
 		if (p->fn == NULL)
 			continue;
 		if (strcmp(p->cmd, cmd))

From 5cd621dead2b1fe71afa723f73904242a1bd01a8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 00:24:27 -0800
Subject: [PATCH 029/131] perf bpf_map: Remove unused code

bpf_map__fprintf is unused so delete it, the header file declaring it
and the now unused static helper functions.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c |  1 -
 tools/perf/util/Build      |  1 -
 tools/perf/util/bpf_map.c  | 70 --------------------------------------
 tools/perf/util/bpf_map.h  | 23 -------------
 4 files changed, 95 deletions(-)
 delete mode 100644 tools/perf/util/bpf_map.c
 delete mode 100644 tools/perf/util/bpf_map.h

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 7ff85fa90d98..1c38f3d16a31 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -21,7 +21,6 @@
 #include <bpf/libbpf.h>
 #include <bpf/btf.h>
 #endif
-#include "util/bpf_map.h"
 #include "util/rlimit.h"
 #include "builtin.h"
 #include "util/cgroup.h"
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 89de23dec401..70cc91d00804 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -170,7 +170,6 @@ perf-util-y += mutex.o
 perf-util-y += sharded_mutex.o
 perf-util-y += intel-tpebs.o
 
-perf-util-$(CONFIG_LIBBPF) += bpf_map.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
 perf-util-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
diff --git a/tools/perf/util/bpf_map.c b/tools/perf/util/bpf_map.c
deleted file mode 100644
index 442f91b4e8e1..000000000000
--- a/tools/perf/util/bpf_map.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-
-#include "util/bpf_map.h"
-#include <bpf/bpf.h>
-#include <bpf/libbpf.h>
-#include <linux/err.h>
-#include <linux/kernel.h>
-#include <errno.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-static bool bpf_map__is_per_cpu(enum bpf_map_type type)
-{
-	return type == BPF_MAP_TYPE_PERCPU_HASH ||
-	       type == BPF_MAP_TYPE_PERCPU_ARRAY ||
-	       type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
-	       type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE;
-}
-
-static void *bpf_map__alloc_value(const struct bpf_map *map)
-{
-	if (bpf_map__is_per_cpu(bpf_map__type(map)))
-		return malloc(round_up(bpf_map__value_size(map), 8) *
-			      sysconf(_SC_NPROCESSORS_CONF));
-
-	return malloc(bpf_map__value_size(map));
-}
-
-int bpf_map__fprintf(struct bpf_map *map, FILE *fp)
-{
-	void *prev_key = NULL, *key, *value;
-	int fd = bpf_map__fd(map), err;
-	int printed = 0;
-
-	if (fd < 0)
-		return fd;
-
-	err = -ENOMEM;
-	key = malloc(bpf_map__key_size(map));
-	if (key == NULL)
-		goto out;
-
-	value = bpf_map__alloc_value(map);
-	if (value == NULL)
-		goto out_free_key;
-
-	while ((err = bpf_map_get_next_key(fd, prev_key, key) == 0)) {
-		int intkey = *(int *)key;
-
-		if (!bpf_map_lookup_elem(fd, key, value)) {
-			bool boolval = *(bool *)value;
-			if (boolval)
-				printed += fprintf(fp, "[%d] = %d,\n", intkey, boolval);
-		} else {
-			printed += fprintf(fp, "[%d] = ERROR,\n", intkey);
-		}
-
-		prev_key = key;
-	}
-
-	if (err == ENOENT)
-		err = printed;
-
-	free(value);
-out_free_key:
-	free(key);
-out:
-	return err;
-}
diff --git a/tools/perf/util/bpf_map.h b/tools/perf/util/bpf_map.h
deleted file mode 100644
index c2f7c13cba23..000000000000
--- a/tools/perf/util/bpf_map.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-#ifndef __PERF_BPF_MAP_H
-#define __PERF_BPF_MAP_H 1
-
-#include <stdio.h>
-struct bpf_map;
-
-#ifdef HAVE_LIBBPF_SUPPORT
-
-int bpf_map__fprintf(struct bpf_map *map, FILE *fp);
-
-#else
-
-#include <linux/compiler.h>
-
-static inline int bpf_map__fprintf(struct bpf_map *map __maybe_unused, FILE *fp __maybe_unused)
-{
-	return 0;
-}
-
-#endif // HAVE_LIBBPF_SUPPORT
-
-#endif // __PERF_BPF_MAP_H

From bb551508e78c886e6d3bcca6c744d3bc3fd8ad59 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 7 Feb 2026 00:24:28 -0800
Subject: [PATCH 030/131] perf record: Remove unused cpu-set-sched.h

Header file declares unused macros, so remove.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-record.c     |  1 -
 tools/perf/util/cpu-set-sched.h | 50 ---------------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 tools/perf/util/cpu-set-sched.h

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 60d764068302..40917a0be238 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -40,7 +40,6 @@
 #include "util/perf_api_probe.h"
 #include "util/trigger.h"
 #include "util/perf-hooks.h"
-#include "util/cpu-set-sched.h"
 #include "util/synthetic-events.h"
 #include "util/time-utils.h"
 #include "util/units.h"
diff --git a/tools/perf/util/cpu-set-sched.h b/tools/perf/util/cpu-set-sched.h
deleted file mode 100644
index 8cf4e40d322a..000000000000
--- a/tools/perf/util/cpu-set-sched.h
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: LGPL-2.1
-// Definitions taken from glibc for use with older systems, same licensing.
-#ifndef _CPU_SET_SCHED_PERF_H
-#define _CPU_SET_SCHED_PERF_H
-
-#include <features.h>
-#include <sched.h>
-
-#ifndef CPU_EQUAL
-#ifndef __CPU_EQUAL_S
-#if __GNUC_PREREQ (2, 91)
-# define __CPU_EQUAL_S(setsize, cpusetp1, cpusetp2) \
-  (__builtin_memcmp (cpusetp1, cpusetp2, setsize) == 0)
-#else
-# define __CPU_EQUAL_S(setsize, cpusetp1, cpusetp2) \
-  (__extension__							      \
-   ({ const __cpu_mask *__arr1 = (cpusetp1)->__bits;			      \
-      const __cpu_mask *__arr2 = (cpusetp2)->__bits;			      \
-      size_t __imax = (setsize) / sizeof (__cpu_mask);			      \
-      size_t __i;							      \
-      for (__i = 0; __i < __imax; ++__i)				      \
-	if (__arr1[__i] != __arr2[__i])					      \
-	  break;							      \
-      __i == __imax; }))
-#endif
-#endif // __CPU_EQUAL_S
-
-#define CPU_EQUAL(cpusetp1, cpusetp2) \
-  __CPU_EQUAL_S (sizeof (cpu_set_t), cpusetp1, cpusetp2)
-#endif // CPU_EQUAL
-
-#ifndef CPU_OR
-#ifndef __CPU_OP_S
-#define __CPU_OP_S(setsize, destset, srcset1, srcset2, op) \
-  (__extension__							      \
-   ({ cpu_set_t *__dest = (destset);					      \
-      const __cpu_mask *__arr1 = (srcset1)->__bits;			      \
-      const __cpu_mask *__arr2 = (srcset2)->__bits;			      \
-      size_t __imax = (setsize) / sizeof (__cpu_mask);			      \
-      size_t __i;							      \
-      for (__i = 0; __i < __imax; ++__i)				      \
-	((__cpu_mask *) __dest->__bits)[__i] = __arr1[__i] op __arr2[__i];    \
-      __dest; }))
-#endif // __CPU_OP_S
-
-#define CPU_OR(destset, srcset1, srcset2) \
-  __CPU_OP_S (sizeof (cpu_set_t), destset, srcset1, srcset2, |)
-#endif // CPU_OR
-
-#endif // _CPU_SET_SCHED_PERF_H

From 6c478e7b3eba3f387a2d6c749e3e3ee0f8ad1c53 Mon Sep 17 00:00:00 2001
From: Mike Leach <mike.leach@arm.com>
Date: Wed, 18 Mar 2026 10:36:39 +0000
Subject: [PATCH 031/131] perf: tools: cs-etm: Fix print issue for Coresight
 debug in ETE/TRBE trace

Building perf with CORESIGHT=1 and the optional CSTRACE_RAW=1 enables
additional debug printing of raw trace data when using command:-
perf report --dump.

This raw trace prints the CoreSight formatted trace frames, which may be
used to investigate suspected issues with trace quality / corruption /
decode.

These frames are not present in ETE + TRBE trace.
This fix removes the unnecessary call to print these frames.

This fix also rationalises implementation - original code had helper
function that unnecessarily repeated initialisation calls that had
already been made.

Due to an additional fault in the OpenCSD library, making this call while
ETE/TRBE trace is being decoded causes a segfault in perf. This fix also
prevents that problem for perf using older (<= 1.8.0) OpenCSD libraries.

Fixes: 68ffe3902898 ("perf tools: Add decoder mechanic to support dumping trace data")
Reported-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Mike Leach <mike.leach@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../perf/util/cs-etm-decoder/cs-etm-decoder.c | 51 +++++--------------
 1 file changed, 13 insertions(+), 38 deletions(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 3050fe212666..8592a778b26a 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -237,46 +237,24 @@ cs_etm_decoder__init_def_logger_printing(struct cs_etm_decoder_params *d_params,
 					      (void *)decoder,
 					      cs_etm_decoder__print_str_cb);
 	if (ret != 0)
-		ret = -1;
-
-	return 0;
-}
+		return -1;
 
 #ifdef CS_LOG_RAW_FRAMES
-static void
-cs_etm_decoder__init_raw_frame_logging(struct cs_etm_decoder_params *d_params,
-				       struct cs_etm_decoder *decoder)
-{
-	/* Only log these during a --dump operation */
-	if (d_params->operation == CS_ETM_OPERATION_PRINT) {
-		/* set up a library default logger to process the
-		 *  raw frame printer we add later
-		 */
-		ocsd_def_errlog_init(OCSD_ERR_SEV_ERROR, 1);
-
-		/* no stdout / err / file output */
-		ocsd_def_errlog_config_output(C_API_MSGLOGOUT_FLG_NONE, NULL);
-
-		/* set the string CB for the default logger,
-		 * passes strings to perf print logger.
-		 */
-		ocsd_def_errlog_set_strprint_cb(decoder->dcd_tree,
-						(void *)decoder,
-						cs_etm_decoder__print_str_cb);
-
+	/*
+	 * Only log raw frames if --dump operation and hardware is actually
+	 * generating formatted CoreSight trace frames
+	 */
+	if ((d_params->operation == CS_ETM_OPERATION_PRINT) &&
+	    (d_params->formatted == true)) {
 		/* use the built in library printer for the raw frames */
-		ocsd_dt_set_raw_frame_printer(decoder->dcd_tree,
-					      CS_RAW_DEBUG_FLAGS);
+		ret = ocsd_dt_set_raw_frame_printer(decoder->dcd_tree,
+						    CS_RAW_DEBUG_FLAGS);
+		if (ret != 0)
+			return -1;
 	}
-}
-#else
-static void
-cs_etm_decoder__init_raw_frame_logging(
-		struct cs_etm_decoder_params *d_params __maybe_unused,
-		struct cs_etm_decoder *decoder __maybe_unused)
-{
-}
 #endif
+	return 0;
+}
 
 static ocsd_datapath_resp_t
 cs_etm_decoder__do_soft_timestamp(struct cs_etm_queue *etmq,
@@ -738,9 +716,6 @@ cs_etm_decoder__new(int decoders, struct cs_etm_decoder_params *d_params,
 	if (ret != 0)
 		goto err_free_decoder;
 
-	/* init raw frame logging if required */
-	cs_etm_decoder__init_raw_frame_logging(d_params, decoder);
-
 	for (i = 0; i < decoders; i++) {
 		ret = cs_etm_decoder__create_etm_decoder(d_params,
 							 &t_params[i],

From 35cd0098eeb9601844cb82c4402fa7e6576c8b01 Mon Sep 17 00:00:00 2001
From: Mike Leach <mike.leach@arm.com>
Date: Wed, 18 Mar 2026 10:36:40 +0000
Subject: [PATCH 032/131] perf: tools: cs-etm: Enhance raw Coresight trace
 debug display

When compiling perf with CORESIGHT=1, an additional build option may
be used: CSTRACE_RAW=1, which will cause the CoreSight formatted trace
frames to be printed out during a perf --dump command. This is useful
when investigating issues with trace generation, decode or possible
data corruption.

e.g. for an ETMv4 trace source into a formatted ETR sink, a dump -

. ... CoreSight ETMV4I Trace data: size 0x28c150 bytes
Idx:0; ID:14;	I_ASYNC : Alignment Synchronisation.
Idx:12; ID:14;	I_TRACE_INFO : Trace Info.; INFO=0x0 { CC.0 };
                Decoder Sync point TINFO
Idx:17; ID:14;	I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.;
                Addr=0x0000000000000000;

becomes with CSTRACE_RAW=1:

. ... CoreSight ETMV4I Trace data: size 0x28c150 bytes
Frame Data; Index 0; ID_DATA[0x14]; 00 00 00 00 00 00 00 00
                                    00 00 00 80 01 01
Idx:0; ID:14;	I_ASYNC : Alignment Synchronisation.
Frame Data; Index 16; ID_DATA[0x14]; 00 9d 00 00 00 00 00 00
                                     00 00 04 85 57 08 f2
Idx:12; ID:14;	I_TRACE_INFO : Trace Info.; INFO=0x0 { CC.0 };
                Decoder Sync point TINFO
Idx:17; ID:14;	I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.;
                Addr=0x0000000000000000;

CSTRACE_RAW=1 has no effect on ETE + TRBE trace as there is no trace
formatting in the TRBE buffer.

This patch enhances the output so that for each packet the individual
bytes associated with the packet are printed.

Thus for ETMv4 this now becomes:

. ... CoreSight ETMV4I Trace data: size 0x28c150 bytes
Frame Data; Index 0; ID_DATA[0x14]; 00 00 00 00 00 00 00 00
                                    00 00 00 80 01 01
Idx:0; ID:14;[0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x80];
             I_ASYNC : Alignment Synchronisation.
Frame Data; Index 16; ID_DATA[0x14]; 00 9d 00 00 00 00 00 00
                                     00 00 04 85 57 08 f2
Idx:12; ID:14; [0x01 0x01 0x00 ]; I_TRACE_INFO : Trace Info.; INFO=0x0
                                  { CC.0 }; Decoder Sync point TINFO
Idx:17; ID:14; [0x9d 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 ];
               I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.;
	       Addr=0x0000000000000000;

ETE trace output changes from:

Idx:0; ID:14; I_ASYNC : Alignment Synchronisation.
Idx:12; ID:14; I_TRACE_INFO : Trace Info.; INFO=0x0 { CC.0, TSTATE.0 };
               Decoder Sync point TINFO
Idx:15; ID:14; I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.;
               Addr=0xFFFF80007CF7F56C;
becoming:

Idx:0; ID:14;[0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x80];
              I_ASYNC : Alignment Synchronisation.
Idx:12; ID:14; [0x01 0x01 0x00 ]; I_TRACE_INFO : Trace Info.; INFO=0x0
               { CC.0, TSTATE.0 }; Decoder Sync point TINFO
Idx:15; ID:14; [0x9d 0x5b 0x7a 0xf7 0x7c 0x00 0x80 0xff 0xff ];
               I_ADDR_L_64IS0 : Address, Long, 64 bit, IS0.;
	       Addr=0xFFFF80007CF7F56C;

Tested-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Mike Leach <mike.leach@arm.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/cs-etm-decoder/cs-etm-decoder.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
index 8592a778b26a..91d3feb18aaf 100644
--- a/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
+++ b/tools/perf/util/cs-etm-decoder/cs-etm-decoder.c
@@ -22,12 +22,15 @@
 /* use raw logging */
 #ifdef CS_DEBUG_RAW
 #define CS_LOG_RAW_FRAMES
+#define CS_PKT_MON	1
 #ifdef CS_RAW_PACKED
 #define CS_RAW_DEBUG_FLAGS (OCSD_DFRMTR_UNPACKED_RAW_OUT | \
 			    OCSD_DFRMTR_PACKED_RAW_OUT)
 #else
 #define CS_RAW_DEBUG_FLAGS (OCSD_DFRMTR_UNPACKED_RAW_OUT)
 #endif
+#else
+#define CS_PKT_MON	0
 #endif
 
 /*
@@ -664,7 +667,7 @@ cs_etm_decoder__create_etm_decoder(struct cs_etm_decoder_params *d_params,
 					   trace_config, &csid))
 			return -1;
 
-		if (ocsd_dt_set_pkt_protocol_printer(decoder->dcd_tree, csid, 0))
+		if (ocsd_dt_set_pkt_protocol_printer(decoder->dcd_tree, csid, CS_PKT_MON))
 			return -1;
 
 		return 0;

From ebbc5ce26eca294cf5f4e63399de63d086900442 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Wed, 18 Mar 2026 12:04:22 +0800
Subject: [PATCH 033/131] perf tools: Remove duplicate include of debug.h

Remove duplicate inclusion of debug.h in symbol.c to clean up redundant
code.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/symbol.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index bd811b2b7890..ce9195717f44 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -26,7 +26,6 @@
 #include "demangle-rust-v0.h"
 #include "dso.h"
 #include "util.h" // lsdir()
-#include "debug.h"
 #include "event.h"
 #include "machine.h"
 #include "map.h"

From 4138987f8a90574f4d5881afa5db4c5f78553811 Mon Sep 17 00:00:00 2001
From: Chen Ni <nichen@iscas.ac.cn>
Date: Wed, 18 Mar 2026 11:49:32 +0800
Subject: [PATCH 034/131] perf tools: Remove duplicate include of stat.h

Remove duplicate inclusion of stat.h in intel-tpebs.c to clean up
redundant code.

Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/intel-tpebs.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
index 3c958d738ca6..2af5455488b2 100644
--- a/tools/perf/util/intel-tpebs.c
+++ b/tools/perf/util/intel-tpebs.c
@@ -22,7 +22,6 @@
 #include "tool.h"
 #include "cpumap.h"
 #include "metricgroup.h"
-#include "stat.h"
 #include <sys/stat.h>
 #include <sys/file.h>
 #include <errno.h>

From 616cd6047cbf736d93808f652086dd10a836005f Mon Sep 17 00:00:00 2001
From: Chen Pei <cp0613@linux.alibaba.com>
Date: Tue, 17 Mar 2026 11:48:47 +0800
Subject: [PATCH 035/131] perf symbol: Add RISCV case in get_plt_sizes

According to RISC-V psABI specification, the PLT (Program Linkage Table)
has the following layout:
- The first PLT entry occupies two 16-byte entries (32 bytes total)
- Subsequent PLT entries take up 16 bytes each

This aligns with the binutils-gdb implementation which defines the same
PLT sizes for RISC-V architecture.

Update get_plt_sizes() to set plt_header_size=32 and plt_entry_size=16
for EM_RISCV, matching the architecture's standard ABI.

Since AARCH64, LOONGARCH, and RISCV have the same PLT size definition,
they are merged together.

Link: https://github.com/riscv-non-isa/riscv-elf-psabi-doc
Link: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=bfd/elfnn-riscv.c

Signed-off-by: Chen Pei <cp0613@linux.alibaba.com>
Reviewed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/symbol-elf.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index d7582dbf379e..3cd4e5a03cc5 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -372,10 +372,8 @@ static bool get_plt_sizes(struct dso *dso, GElf_Ehdr *ehdr, GElf_Shdr *shdr_plt,
 		*plt_entry_size = 12;
 		return true;
 	case EM_AARCH64:
-		*plt_header_size = 32;
-		*plt_entry_size = 16;
-		return true;
 	case EM_LOONGARCH:
+	case EM_RISCV:
 		*plt_header_size = 32;
 		*plt_entry_size = 16;
 		return true;

From 30b2e6fa58f3b9eff86fb851a8926bf814d82dcd Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:14 -0400
Subject: [PATCH 036/131] perf dwarf-aux: Add die_get_pointer_type to get
 pointer types

When a variable type is wrapped in typedef/qualifiers, callers may need
to first resolve it to the underlying DW_TAG_pointer_type or
DW_TAG_array_type. A simple tag check is not enough and directly calling
__die_get_real_type() can stop at the pointer type (e.g. typedef ->
pointer) instead of the pointee type.

Add die_get_pointer_type() helper that follows typedef/qualifier chains
and returns the underlying pointer DIE. Use it in annotate-data.c so
pointer checks and dereference work correctly for typedef'd pointers.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 39 +++++++++++++++++++--------------
 tools/perf/util/dwarf-aux.c     | 27 +++++++++++++++++++++++
 tools/perf/util/dwarf-aux.h     |  2 ++
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 44fbd41e3845..cda020ea18d5 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -455,13 +455,6 @@ static const char *match_result_str(enum type_match_result tmr)
 	}
 }
 
-static bool is_pointer_type(Dwarf_Die *type_die)
-{
-	int tag = dwarf_tag(type_die);
-
-	return tag == DW_TAG_pointer_type || tag == DW_TAG_array_type;
-}
-
 static bool is_compound_type(Dwarf_Die *type_die)
 {
 	int tag = dwarf_tag(type_die);
@@ -474,19 +467,24 @@ static bool is_better_type(Dwarf_Die *type_a, Dwarf_Die *type_b)
 {
 	Dwarf_Word size_a, size_b;
 	Dwarf_Die die_a, die_b;
+	Dwarf_Die ptr_a, ptr_b;
+	Dwarf_Die *ptr_type_a, *ptr_type_b;
+
+	ptr_type_a = die_get_pointer_type(type_a, &ptr_a);
+	ptr_type_b = die_get_pointer_type(type_b, &ptr_b);
 
 	/* pointer type is preferred */
-	if (is_pointer_type(type_a) != is_pointer_type(type_b))
-		return is_pointer_type(type_b);
+	if ((ptr_type_a != NULL) != (ptr_type_b != NULL))
+		return ptr_type_b != NULL;
 
-	if (is_pointer_type(type_b)) {
+	if (ptr_type_b) {
 		/*
 		 * We want to compare the target type, but 'void *' can fail to
 		 * get the target type.
 		 */
-		if (die_get_real_type(type_a, &die_a) == NULL)
+		if (die_get_real_type(ptr_type_a, &die_a) == NULL)
 			return true;
-		if (die_get_real_type(type_b, &die_b) == NULL)
+		if (die_get_real_type(ptr_type_b, &die_b) == NULL)
 			return false;
 
 		type_a = &die_a;
@@ -539,7 +537,7 @@ static enum type_match_result check_variable(struct data_loc_info *dloc,
 	 * and local variables are accessed directly without a pointer.
 	 */
 	if (needs_pointer) {
-		if (!is_pointer_type(type_die) ||
+		if (die_get_pointer_type(type_die, type_die) == NULL ||
 		    __die_get_real_type(type_die, type_die) == NULL)
 			return PERF_TMR_NO_POINTER;
 	}
@@ -880,12 +878,16 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo
 			continue;
 
 		if (var->reg == DWARF_REG_FB || var->reg == fbreg || var->reg == state->stack_reg) {
+			Dwarf_Die ptr_die;
+			Dwarf_Die *ptr_type;
 			int offset = var->offset;
 			struct type_state_stack *stack;
 
+			ptr_type = die_get_pointer_type(&mem_die, &ptr_die);
+
 			/* If the reg location holds the pointer value, dereference the type */
-			if (!var->is_reg_var_addr && is_pointer_type(&mem_die) &&
-				__die_get_real_type(&mem_die, &mem_die) == NULL)
+			if (!var->is_reg_var_addr && ptr_type &&
+			    __die_get_real_type(ptr_type, &mem_die) == NULL)
 				continue;
 
 			if (var->reg != DWARF_REG_FB)
@@ -1110,7 +1112,9 @@ again:
 		goto check_non_register;
 
 	if (state->regs[reg].kind == TSR_KIND_TYPE) {
+		Dwarf_Die ptr_die;
 		Dwarf_Die sized_type;
+		Dwarf_Die *ptr_type;
 		struct strbuf sb;
 
 		strbuf_init(&sb, 32);
@@ -1122,7 +1126,8 @@ again:
 		 * Normal registers should hold a pointer (or array) to
 		 * dereference a memory location.
 		 */
-		if (!is_pointer_type(&state->regs[reg].type)) {
+		ptr_type = die_get_pointer_type(&state->regs[reg].type, &ptr_die);
+		if (!ptr_type) {
 			if (dloc->op->offset < 0 && reg != state->stack_reg)
 				goto check_kernel;
 
@@ -1130,7 +1135,7 @@ again:
 		}
 
 		/* Remove the pointer and get the target type */
-		if (__die_get_real_type(&state->regs[reg].type, type_die) == NULL)
+		if (__die_get_real_type(ptr_type, type_die) == NULL)
 			return PERF_TMR_NO_POINTER;
 
 		dloc->type_offset = dloc->op->offset + state->regs[reg].offset;
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 9267af204c7d..38142062d6e5 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -303,6 +303,33 @@ Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
 	return vr_die;
 }
 
+/**
+ * die_get_pointer_type - Get a pointer/array type die
+ * @type_die: a DIE of a type
+ * @die_mem: where to store a type DIE
+ *
+ * Get a pointer/array type DIE from @type_die. If the type is a typedef or
+ * qualifier (const, volatile, etc.), follow the chain to find the underlying
+ * pointer type.
+ */
+Dwarf_Die *die_get_pointer_type(Dwarf_Die *type_die, Dwarf_Die *die_mem)
+{
+	int tag;
+
+	do {
+		tag = dwarf_tag(type_die);
+		if (tag == DW_TAG_pointer_type || tag == DW_TAG_array_type)
+			return type_die;
+		if (tag != DW_TAG_typedef && tag != DW_TAG_const_type &&
+		    tag != DW_TAG_restrict_type && tag != DW_TAG_volatile_type &&
+		    tag != DW_TAG_shared_type)
+			return NULL;
+		type_die = die_get_type(type_die, die_mem);
+	} while (type_die);
+
+	return NULL;
+}
+
 /* Get attribute and translate it as a udata */
 static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name,
 			      Dwarf_Word *result)
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index cd481ec9c5a1..99d2735122d5 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -60,6 +60,8 @@ Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem);
 Dwarf_Die *__die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem);
 /* Get a type die, but skip qualifiers and typedef */
 Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem);
+/* Get a pointer/array type, following typedefs/qualifiers */
+Dwarf_Die *die_get_pointer_type(Dwarf_Die *type_die, Dwarf_Die *die_mem);
 
 /* Check whether the DIE is signed or not */
 bool die_is_signed_type(Dwarf_Die *tp_die);

From ace16303179efad4e1a2aebb27a661e5d1e7277d Mon Sep 17 00:00:00 2001
From: Zecheng Li <zecheng@google.com>
Date: Mon, 9 Mar 2026 13:55:15 -0400
Subject: [PATCH 037/131] perf dwarf-aux: Preserve typedefs in match_var_offset

Preserve typedefs in match_var_offset to match the results of
__die_get_real_type. Also move the (offset == 0) branch after the
is_pointer check to ensure the correct type is used, fixing cases where
an incorrect pointer type was chosen when the access offset was 0.

Signed-off-by: Zecheng Li <zecheng@google.com>
Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/dwarf-aux.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 38142062d6e5..3b0fc9038f19 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1405,6 +1405,8 @@ struct find_var_data {
 	Dwarf_Addr addr;
 	/* Target register */
 	unsigned reg;
+	/* Access data type */
+	Dwarf_Die type;
 	/* Access offset, set for global data */
 	int offset;
 	/* True if the current register is the frame base */
@@ -1417,29 +1419,31 @@ struct find_var_data {
 static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data,
 			     s64 addr_offset, s64 addr_type, bool is_pointer)
 {
-	Dwarf_Die type_die;
 	Dwarf_Word size;
+	Dwarf_Die ptr_die;
+	Dwarf_Die *ptr_type;
 	s64 offset = addr_offset - addr_type;
 
+	if (offset < 0)
+		return false;
+
+	if (__die_get_real_type(die_mem, &data->type) == NULL)
+		return false;
+
+	ptr_type = die_get_pointer_type(&data->type, &ptr_die);
+	if (is_pointer && ptr_type) {
+		/* Get the target type of the pointer */
+		if (__die_get_real_type(ptr_type, &data->type) == NULL)
+			return false;
+	}
+
 	if (offset == 0) {
 		/* Update offset relative to the start of the variable */
 		data->offset = 0;
 		return true;
 	}
 
-	if (offset < 0)
-		return false;
-
-	if (die_get_real_type(die_mem, &type_die) == NULL)
-		return false;
-
-	if (is_pointer && dwarf_tag(&type_die) == DW_TAG_pointer_type) {
-		/* Get the target type of the pointer */
-		if (die_get_real_type(&type_die, &type_die) == NULL)
-			return false;
-	}
-
-	if (dwarf_aggregate_size(&type_die, &size) < 0)
+	if (dwarf_aggregate_size(&data->type, &size) < 0)
 		return false;
 
 	if ((u64)offset >= size)

From 8b8d8b8f17dfa817e4e94ce4e8f26d92f6f65504 Mon Sep 17 00:00:00 2001
From: Zecheng Li <zecheng@google.com>
Date: Mon, 9 Mar 2026 13:55:16 -0400
Subject: [PATCH 038/131] perf dwarf-aux: Skip check_variable for variable
 lookup

Both die_find_variable_by_reg and die_find_variable_by_addr call
match_var_offset which already performs sufficient checking and type
matching. The additional check_variable call is redundant, and its
needs_pointer logic is only a heuristic. Since DWARF encodes accurate
type information, which match_var_offset verifies, skipping
check_variable improves both coverage and accuracy.

Return the matched type from die_find_variable_by_reg and
die_find_variable_by_addr via the existing `type` field in
find_var_data, removing the need for check_variable in
find_data_type_die.

Signed-off-by: Zecheng Li <zecheng@google.com>
Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 42 ++++++++++++++-------------------
 tools/perf/util/dwarf-aux.c     | 13 ++++++----
 tools/perf/util/dwarf-aux.h     |  5 ++--
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index cda020ea18d5..23a09bf58f86 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -814,9 +814,8 @@ bool get_global_var_type(Dwarf_Die *cu_die, struct data_loc_info *dloc,
 	}
 
 	/* Try to get the variable by address first */
-	if (die_find_variable_by_addr(cu_die, var_addr, &var_die, &offset) &&
-	    check_variable(dloc, &var_die, type_die, DWARF_REG_PC, offset,
-			   /*is_fbreg=*/false) == PERF_TMR_OK) {
+	if (die_find_variable_by_addr(cu_die, var_addr, &var_die, type_die,
+				      &offset)) {
 		var_name = dwarf_diename(&var_die);
 		*var_offset = offset;
 		goto ok;
@@ -1606,12 +1605,13 @@ retry:
 
 		if (reg == DWARF_REG_PC) {
 			if (!die_find_variable_by_addr(&scopes[i], dloc->var_addr,
-						       &var_die, &type_offset))
+						       &var_die, &mem_die,
+						       &type_offset))
 				continue;
 		} else {
 			/* Look up variables/parameters in this scope */
 			if (!die_find_variable_by_reg(&scopes[i], pc, reg,
-						      &type_offset, is_fbreg, &var_die))
+						      &mem_die, &type_offset, is_fbreg, &var_die))
 				continue;
 		}
 
@@ -1619,26 +1619,20 @@ retry:
 			     dwarf_diename(&var_die), (long)dwarf_dieoffset(&var_die),
 			     i+1, nr_scopes, (long)dwarf_dieoffset(&scopes[i]));
 
-		/* Found a variable, see if it's correct */
-		result = check_variable(dloc, &var_die, &mem_die, reg, type_offset, is_fbreg);
-		if (result == PERF_TMR_OK) {
-			if (reg == DWARF_REG_PC) {
-				pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n",
-					     dloc->var_addr, type_offset);
-			} else if (reg == DWARF_REG_FB || is_fbreg) {
-				pr_debug_dtp("stack_offset=%#x type_offset=%#x\n",
-					     fb_offset, type_offset);
-			} else {
-				pr_debug_dtp("type_offset=%#x\n", type_offset);
-			}
-
-			if (!found || is_better_type(type_die, &mem_die)) {
-				*type_die = mem_die;
-				dloc->type_offset = type_offset;
-				found = true;
-			}
+		if (reg == DWARF_REG_PC) {
+			pr_debug_dtp("addr=%#"PRIx64" type_offset=%#x\n",
+				     dloc->var_addr, type_offset);
+		} else if (reg == DWARF_REG_FB || is_fbreg) {
+			pr_debug_dtp("stack_offset=%#x type_offset=%#x\n",
+				     fb_offset, type_offset);
 		} else {
-			pr_debug_dtp("failed: %s\n", match_result_str(result));
+			pr_debug_dtp("type_offset=%#x\n", type_offset);
+		}
+
+		if (!found || is_better_type(type_die, &mem_die)) {
+			*type_die = mem_die;
+			dloc->type_offset = type_offset;
+			found = true;
 		}
 
 		pr_debug_location(&var_die, pc, reg);
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 3b0fc9038f19..1484aa756826 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1560,7 +1560,7 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg)
  * when the variable is in the stack.
  */
 Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg,
-				    int *poffset, bool is_fbreg,
+				    Dwarf_Die *type_die, int *poffset, bool is_fbreg,
 				    Dwarf_Die *die_mem)
 {
 	struct find_var_data data = {
@@ -1572,8 +1572,10 @@ Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg,
 	Dwarf_Die *result;
 
 	result = die_find_child(sc_die, __die_find_var_reg_cb, &data, die_mem);
-	if (result)
+	if (result) {
 		*poffset = data.offset;
+		*type_die = data.type;
+	}
 	return result;
 }
 
@@ -1617,7 +1619,8 @@ static int __die_find_var_addr_cb(Dwarf_Die *die_mem, void *arg)
  * This is usually for global variables.
  */
 Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr,
-				     Dwarf_Die *die_mem, int *offset)
+				     Dwarf_Die *die_mem, Dwarf_Die *type_die,
+				     int *offset)
 {
 	struct find_var_data data = {
 		.addr = addr,
@@ -1625,8 +1628,10 @@ Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr,
 	Dwarf_Die *result;
 
 	result = die_find_child(sc_die, __die_find_var_addr_cb, &data, die_mem);
-	if (result)
+	if (result) {
 		*offset = data.offset;
+		*type_die = data.type;
+	}
 	return result;
 }
 
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 99d2735122d5..939a59c91796 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -165,12 +165,13 @@ int die_get_var_range(Dwarf_Die *sp_die, Dwarf_Die *vr_die, struct strbuf *buf);
 
 /* Find a variable saved in the 'reg' at given address */
 Dwarf_Die *die_find_variable_by_reg(Dwarf_Die *sc_die, Dwarf_Addr pc, int reg,
-				    int *poffset, bool is_fbreg,
+				    Dwarf_Die *type_die, int *poffset, bool is_fbreg,
 				    Dwarf_Die *die_mem);
 
 /* Find a (global) variable located in the 'addr' */
 Dwarf_Die *die_find_variable_by_addr(Dwarf_Die *sc_die, Dwarf_Addr addr,
-				     Dwarf_Die *die_mem, int *offset);
+				     Dwarf_Die *die_mem, Dwarf_Die *type_die,
+				     int *offset);
 
 /* Save all variables and parameters in this scope */
 void die_collect_vars(Dwarf_Die *sc_die, struct die_var_type **var_types);

From 69953f9c65856fc9438fc2ad4b9fd8255a2e47da Mon Sep 17 00:00:00 2001
From: Zecheng Li <zecheng@google.com>
Date: Mon, 9 Mar 2026 13:55:17 -0400
Subject: [PATCH 039/131] perf annotate-data: Improve type comparison from
 different scopes

When comparing types from different scopes, first compare their type
offsets. A larger offset means the field belongs to an outer (enclosing)
struct. This helps resolve cases where a pointer is found in an inner
scope, but a struct containing that pointer exists in an outer scope.
Previously, is_better_type would prefer the pointer type, but the struct
type is actually more complete and should be chosen.

Prefer types from outer scopes when is_better_type cannot determine a
better type. This is a heuristic for the case `struct A { struct B; }`
where A and B have the same size; I think in most such cases A is in
the outer scope and should be preferred.

Signed-off-by: Zecheng Li <zecheng@google.com>
Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 23a09bf58f86..6fe2efd48a83 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1629,7 +1629,9 @@ retry:
 			pr_debug_dtp("type_offset=%#x\n", type_offset);
 		}
 
-		if (!found || is_better_type(type_die, &mem_die)) {
+		if (!found || dloc->type_offset < type_offset ||
+		    (dloc->type_offset == type_offset &&
+		     !is_better_type(&mem_die, type_die))) {
 			*type_die = mem_die;
 			dloc->type_offset = type_offset;
 			found = true;

From 6ffc3d0d3db5fb6c88fcb69eb355e9cc839a860c Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:18 -0400
Subject: [PATCH 040/131] perf dwarf-aux: Handle array types in
 die_get_member_type

When a struct member is an array type, die_get_member_type() would stop
iterating since array types weren't handled in the loop. This caused
accesses to array elements within structs to not resolve properly.

Add array type handling by resolving the array to its element type and
calculating the offset within an element using modulo arithmetic.

This improves type annotation coverage for struct members that are
arrays.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/dwarf-aux.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 1484aa756826..1feefc329154 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -2127,13 +2127,28 @@ Dwarf_Die *die_get_member_type(Dwarf_Die *type_die, int offset,
 
 		tag = dwarf_tag(&mb_type);
 
-		if (tag == DW_TAG_structure_type || tag == DW_TAG_union_type) {
+		if (tag == DW_TAG_structure_type || tag == DW_TAG_union_type ||
+		    tag == DW_TAG_array_type) {
 			Dwarf_Word loc;
 
 			/* Update offset for the start of the member struct */
 			if (die_get_data_member_location(member, &loc) == 0)
 				offset -= loc;
 		}
+
+		/* Handle array types: resolve to the element type by one level */
+		if (tag == DW_TAG_array_type) {
+			Dwarf_Word size;
+
+			if (die_get_real_type(&mb_type, &mb_type) == NULL)
+				return NULL;
+
+			if (dwarf_aggregate_size(&mb_type, &size) < 0)
+				return NULL;
+
+			offset = offset % size;
+			tag = dwarf_tag(&mb_type);
+		}
 	}
 	*die_mem = mb_type;
 	return die_mem;

From 752e662ae0619721ddde6f60a84fbe3c669fc539 Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:19 -0400
Subject: [PATCH 041/131] perf annotate-data: Collect global variables without
 name

Previously, global_var__collect() required get_global_var_info() to
succeed (i.e., the variable must have a symbol name) before caching a
global variable. This prevented variables that exist in DWARF but lack
symbol table coverage from being cached.

Remove the symbol table requirement since DW_OP_addr already provides
the variable's address directly from DWARF. The symbol table lookup is
now optional to obtain the variable name when available.

Also remove the var_offset != 0 check, which was intended to skip
variables where the access address doesn't match the symbol start. The
symbol table lookup is now optional and I found removing this check has
no effect on the annotation results for both kernel and userspace
programs.

Test results show improved annotation coverage especially for userspace
programs with RIP-relative addressing instructions.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 6fe2efd48a83..301f73ea8275 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -774,12 +774,7 @@ static void global_var__collect(struct data_loc_info *dloc)
 			if (!dwarf_offdie(dwarf, pos->die_off, &type_die))
 				continue;
 
-			if (!get_global_var_info(dloc, pos->addr, &var_name,
-						 &var_offset))
-				continue;
-
-			if (var_offset != 0)
-				continue;
+			get_global_var_info(dloc, pos->addr, &var_name, &var_offset);
 
 			global_var__add(dloc, pos->addr, var_name, &type_die);
 		}

From 1b8db0c963bf788392976bea87f0ef8d227c4930 Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:20 -0400
Subject: [PATCH 042/131] perf annotate-data: Handle global variable access
 with const register

When a register holds a constant value (TSR_KIND_CONST) and is used with
a negative offset, treat it as a potential global variable access
instead of falling through to CFA (frame) handling.

This fixes cases like array indexing with computed offsets:

    movzbl -0x7d72725a(%rax), %eax   # array[%rax]

Where %rax contains a computed index and the negative offset points to a
global array. Previously this fell through to the CFA path which doesn't
handle global variables, resulting in "no type information".

The fix redirects such accesses to check_kernel which calls
get_global_var_type() to resolve the type from the global variable
cache. This is only done for kernel DSOs since the pattern relies on
kernel-specific global variable resolution. We could also route
registers with integer types to the global variable path, but this
requires more changes.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-data.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 301f73ea8275..50c82c91f828 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -1229,6 +1229,11 @@ again:
 		return PERF_TMR_BAIL_OUT;
 	}
 
+	if (state->regs[reg].kind == TSR_KIND_CONST &&
+	    dso__kernel(map__dso(dloc->ms->map))) {
+		if (dloc->op->offset < 0 && reg != state->stack_reg && reg != dloc->fbreg)
+			goto check_kernel;
+	}
 check_non_register:
 	if (reg == dloc->fbreg || reg == state->stack_reg) {
 		struct type_state_stack *stack;

From 22b320777c5f496a36867f16f18870e67b123020 Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:21 -0400
Subject: [PATCH 043/131] perf annotate-data: Add invalidate_reg_state() helper
 for x86

Add a helper function to consistently invalidate register state instead
of open-coded field assignments. This ensures kind, ok, and copied_from
are all properly cleared when a register becomes invalid.

The helper sets:
- kind = TSR_KIND_INVALID
- ok = false
- copied_from = -1

Replace all invalidation patterns with calls to this helper. No
functional change and this removes some incorrect annotations that were
caused by incomplete invalidation (e.g. an obsolete copied_from from an
invalidated register).

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-arch/annotate-x86.c | 29 ++++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
index eb9a649ca656..eb390a253d71 100644
--- a/tools/perf/util/annotate-arch/annotate-x86.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -204,6 +204,13 @@ static int x86__cpuid_parse(struct arch *arch, const char *cpuid)
 }
 
 #ifdef HAVE_LIBDW_SUPPORT
+static void invalidate_reg_state(struct type_state_reg *reg)
+{
+	reg->kind = TSR_KIND_INVALID;
+	reg->ok = false;
+	reg->copied_from = -1;
+}
+
 static void update_insn_state_x86(struct type_state *state,
 				  struct data_loc_info *dloc, Dwarf_Die *cu_die,
 				  struct disasm_line *dl)
@@ -235,7 +242,7 @@ static void update_insn_state_x86(struct type_state *state,
 		/* Otherwise invalidate caller-saved registers after call */
 		for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) {
 			if (state->regs[i].caller_saved)
-				state->regs[i].ok = false;
+				invalidate_reg_state(&state->regs[i]);
 		}
 
 		/* Update register with the return type (if any) */
@@ -364,8 +371,7 @@ static void update_insn_state_x86(struct type_state *state,
 		src_tsr = state->regs[sreg];
 		tsr = &state->regs[dst->reg1];
 
-		tsr->copied_from = -1;
-		tsr->ok = false;
+		invalidate_reg_state(tsr);
 
 		/* Case 1: Based on stack pointer or frame pointer */
 		if (sreg == fbreg || sreg == state->stack_reg) {
@@ -433,8 +439,7 @@ static void update_insn_state_x86(struct type_state *state,
 		    !strncmp(dl->ins.name, "inc", 3)  || !strncmp(dl->ins.name, "dec", 3)) {
 			pr_debug_dtp("%s [%x] invalidate reg%d\n",
 						dl->ins.name, insn_offset, dst->reg1);
-			state->regs[dst->reg1].ok = false;
-			state->regs[dst->reg1].copied_from = -1;
+			invalidate_reg_state(&state->regs[dst->reg1]);
 			return;
 		}
 
@@ -496,7 +501,7 @@ static void update_insn_state_x86(struct type_state *state,
 			if (!get_global_var_type(cu_die, dloc, ip, var_addr,
 						 &offset, &type_die) ||
 			    !die_get_member_type(&type_die, offset, &type_die)) {
-				tsr->ok = false;
+				invalidate_reg_state(tsr);
 				return;
 			}
 
@@ -524,7 +529,7 @@ static void update_insn_state_x86(struct type_state *state,
 
 		if (!has_reg_type(state, src->reg1) ||
 		    !state->regs[src->reg1].ok) {
-			tsr->ok = false;
+			invalidate_reg_state(tsr);
 			return;
 		}
 
@@ -560,7 +565,7 @@ retry:
 
 			stack = find_stack_state(state, offset);
 			if (stack == NULL) {
-				tsr->ok = false;
+				invalidate_reg_state(tsr);
 				return;
 			} else if (!stack->compound) {
 				tsr->type = stack->type;
@@ -575,7 +580,7 @@ retry:
 				tsr->offset = 0;
 				tsr->ok = true;
 			} else {
-				tsr->ok = false;
+				invalidate_reg_state(tsr);
 				return;
 			}
 
@@ -628,7 +633,7 @@ retry:
 			if (!get_global_var_type(cu_die, dloc, ip, addr, &offset,
 						 &type_die) ||
 			    !die_get_member_type(&type_die, offset, &type_die)) {
-				tsr->ok = false;
+				invalidate_reg_state(tsr);
 				return;
 			}
 
@@ -679,7 +684,7 @@ retry:
 				}
 				pr_debug_type_name(&tsr->type, tsr->kind);
 			} else {
-				tsr->ok = false;
+				invalidate_reg_state(tsr);
 			}
 		}
 		/* And then dereference the calculated pointer if it has one */
@@ -721,7 +726,7 @@ retry:
 				}
 			}
 
-			tsr->ok = false;
+			invalidate_reg_state(tsr);
 		}
 	}
 	/* Case 3. register to memory transfers */

From d35b0d5877109ecca106cc3835d4d23ac2cdc33c Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:22 -0400
Subject: [PATCH 044/131] perf annotate-data: Invalidate caller-saved regs for
 all calls

Previously, the x86 call handler returned early without invalidating
caller-saved registers when the call target symbol could not be resolved
(func == NULL). This violated the ABI which requires caller-saved
registers to be considered clobbered after any call instruction.

Fix this by:
1. Always invalidating caller-saved registers for any call instruction
   (except __fentry__ which preserves registers)
2. Using dl->ops.target.name as fallback when func->name is unavailable,
   allowing return type lookup for more call targets

This is a conservative change that may reduce type coverage for indirect
calls (e.g., callq *(%rax)) where we cannot determine the return type
but it ensures correctness.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-arch/annotate-x86.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
index eb390a253d71..df9fc0a51b39 100644
--- a/tools/perf/util/annotate-arch/annotate-x86.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -229,24 +229,31 @@ static void update_insn_state_x86(struct type_state *state,
 
 	if (ins__is_call(&dl->ins)) {
 		struct symbol *func = dl->ops.target.sym;
+		const char *call_name;
 
-		if (func == NULL)
-			return;
+		/* Try to resolve the call target name */
+		if (func)
+			call_name = func->name;
+		else
+			call_name = dl->ops.target.name;
 
 		/* __fentry__ will preserve all registers */
-		if (!strcmp(func->name, "__fentry__"))
+		if (call_name && !strcmp(call_name, "__fentry__"))
 			return;
 
-		pr_debug_dtp("call [%x] %s\n", insn_offset, func->name);
+		if (call_name)
+			pr_debug_dtp("call [%x] %s\n", insn_offset, call_name);
+		else
+			pr_debug_dtp("call [%x] <unknown>\n", insn_offset);
 
-		/* Otherwise invalidate caller-saved registers after call */
+		/* Invalidate caller-saved registers after call (ABI requirement) */
 		for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) {
 			if (state->regs[i].caller_saved)
 				invalidate_reg_state(&state->regs[i]);
 		}
 
 		/* Update register with the return type (if any) */
-		if (die_find_func_rettype(cu_die, func->name, &type_die)) {
+		if (call_name && die_find_func_rettype(cu_die, call_name, &type_die)) {
 			tsr = &state->regs[state->ret_reg];
 			tsr->type = type_die;
 			tsr->kind = TSR_KIND_TYPE;

From 4fb7eefe6c539840fa8854d67d00af35331b8843 Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:23 -0400
Subject: [PATCH 045/131] perf annotate-data: Use DWARF location ranges to
 preserve reg state

When a function call occurs, caller-saved registers are typically
invalidated since the callee may clobber them. However, DWARF debug info
provides location ranges that indicate exactly where a variable is valid
in a register.

Track the DWARF location range end address in type_state_reg and use it
to determine if a caller-saved register should be preserved across a
call. If the current call address is within the DWARF-specified lifetime
of the variable, keep the register state valid instead of invalidating
it.

This improves type annotation for code where the compiler knows a
register value survives across calls (e.g., when the callee is known not
to clobber certain registers or when the value is reloaded after the
call at the same logical location).

Changes:
- Add `end` and `has_range` fields to die_var_type to capture DWARF
  location range information
- Add `lifetime_active` and `lifetime_end` fields to type_state_reg
- Check location lifetime before invalidating caller-saved registers

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-arch/annotate-x86.c | 25 +++++++++++++++++---
 tools/perf/util/annotate-data.c              | 24 +++++++++++++++++--
 tools/perf/util/annotate-data.h              |  3 +++
 tools/perf/util/dwarf-aux.c                  |  6 ++++-
 tools/perf/util/dwarf-aux.h                  |  2 ++
 5 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
index df9fc0a51b39..c77aabd48eba 100644
--- a/tools/perf/util/annotate-arch/annotate-x86.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -208,6 +208,8 @@ static void invalidate_reg_state(struct type_state_reg *reg)
 {
 	reg->kind = TSR_KIND_INVALID;
 	reg->ok = false;
+	reg->lifetime_active = false;
+	reg->lifetime_end = 0;
 	reg->copied_from = -1;
 }
 
@@ -230,6 +232,7 @@ static void update_insn_state_x86(struct type_state *state,
 	if (ins__is_call(&dl->ins)) {
 		struct symbol *func = dl->ops.target.sym;
 		const char *call_name;
+		u64 call_addr;
 
 		/* Try to resolve the call target name */
 		if (func)
@@ -246,10 +249,18 @@ static void update_insn_state_x86(struct type_state *state,
 		else
 			pr_debug_dtp("call [%x] <unknown>\n", insn_offset);
 
-		/* Invalidate caller-saved registers after call (ABI requirement) */
+		/* Invalidate caller-saved registers after call */
+		call_addr = map__rip_2objdump(dloc->ms->map,
+					      dloc->ms->sym->start + dl->al.offset);
 		for (unsigned i = 0; i < ARRAY_SIZE(state->regs); i++) {
-			if (state->regs[i].caller_saved)
-				invalidate_reg_state(&state->regs[i]);
+			struct type_state_reg *reg = &state->regs[i];
+
+			if (!reg->caller_saved)
+				continue;
+			/* Keep register valid within DWARF location lifetime */
+			if (reg->lifetime_active && call_addr < reg->lifetime_end)
+				continue;
+			invalidate_reg_state(reg);
 		}
 
 		/* Update register with the return type (if any) */
@@ -279,6 +290,8 @@ static void update_insn_state_x86(struct type_state *state,
 
 		tsr = &state->regs[dst->reg1];
 		tsr->copied_from = -1;
+		tsr->lifetime_active = false;
+		tsr->lifetime_end = 0;
 
 		if (src->imm)
 			imm_value = src->offset;
@@ -344,6 +357,8 @@ static void update_insn_state_x86(struct type_state *state,
 
 		tsr = &state->regs[dst->reg1];
 		tsr->copied_from = -1;
+		tsr->lifetime_active = false;
+		tsr->lifetime_end = 0;
 
 		if (src->imm)
 			imm_value = src->offset;
@@ -458,6 +473,8 @@ static void update_insn_state_x86(struct type_state *state,
 			state->regs[dst->reg1].kind = TSR_KIND_CONST;
 			state->regs[dst->reg1].imm_value = 0;
 			state->regs[dst->reg1].ok = true;
+			state->regs[dst->reg1].lifetime_active = false;
+			state->regs[dst->reg1].lifetime_end = 0;
 			state->regs[dst->reg1].copied_from = -1;
 			return;
 		}
@@ -544,6 +561,8 @@ static void update_insn_state_x86(struct type_state *state,
 		tsr->kind = state->regs[src->reg1].kind;
 		tsr->imm_value = state->regs[src->reg1].imm_value;
 		tsr->offset = state->regs[src->reg1].offset;
+		tsr->lifetime_active = state->regs[src->reg1].lifetime_active;
+		tsr->lifetime_end = state->regs[src->reg1].lifetime_end;
 		tsr->ok = true;
 
 		/* To copy back the variable type later (hopefully) */
diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c
index 50c82c91f828..1eff0a27237d 100644
--- a/tools/perf/util/annotate-data.c
+++ b/tools/perf/util/annotate-data.c
@@ -840,6 +840,18 @@ static bool die_is_same(Dwarf_Die *die_a, Dwarf_Die *die_b)
 	return (die_a->cu == die_b->cu) && (die_a->addr == die_b->addr);
 }
 
+static void tsr_set_lifetime(struct type_state_reg *tsr,
+			     const struct die_var_type *var)
+{
+	if (var && var->has_range && var->end > var->addr) {
+		tsr->lifetime_active = true;
+		tsr->lifetime_end = var->end;
+	} else {
+		tsr->lifetime_active = false;
+		tsr->lifetime_end = 0;
+	}
+}
+
 /**
  * update_var_state - Update type state using given variables
  * @state: type state table
@@ -865,8 +877,14 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo
 	}
 
 	for (var = var_types; var != NULL; var = var->next) {
-		if (var->addr != addr)
-			continue;
+		/* Check if addr falls within the variable's valid range */
+		if (var->has_range) {
+			if (addr < var->addr || (var->end && addr >= var->end))
+				continue;
+		} else {
+			if (addr != var->addr)
+				continue;
+		}
 		/* Get the type DIE using the offset */
 		if (!dwarf_offdie(dloc->di->dbg, var->die_off, &mem_die))
 			continue;
@@ -923,6 +941,7 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo
 				reg->type = mem_die;
 				reg->kind = TSR_KIND_POINTER;
 				reg->ok = true;
+				tsr_set_lifetime(reg, var);
 
 				pr_debug_dtp("var [%"PRIx64"] reg%d addr offset %x",
 					     insn_offset, var->reg, var->offset);
@@ -939,6 +958,7 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo
 			reg->type = mem_die;
 			reg->kind = TSR_KIND_TYPE;
 			reg->ok = true;
+			tsr_set_lifetime(reg, var);
 
 			pr_debug_dtp("var [%"PRIx64"] reg%d offset %x",
 				     insn_offset, var->reg, var->offset);
diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h
index 9b222869e42d..c26130744260 100644
--- a/tools/perf/util/annotate-data.h
+++ b/tools/perf/util/annotate-data.h
@@ -182,6 +182,9 @@ struct type_state_reg {
 	s32 offset;
 	bool ok;
 	bool caller_saved;
+	/* DWARF location range tracking for register lifetime */
+	bool lifetime_active;
+	u64 lifetime_end;
 	u8 kind;
 	u8 copied_from;
 };
diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 1feefc329154..0710c875416f 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1641,7 +1641,7 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg)
 	Dwarf_Die type_die;
 	int tag = dwarf_tag(die_mem);
 	Dwarf_Attribute attr;
-	Dwarf_Addr base, start, end;
+	Dwarf_Addr base, start, end = 0;
 	Dwarf_Op *ops;
 	size_t nops;
 	struct die_var_type *vt;
@@ -1681,6 +1681,8 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg)
 
 	vt->die_off = dwarf_dieoffset(&type_die);
 	vt->addr = start;
+	vt->end = end;
+	vt->has_range = (end != 0 || start != 0);
 	vt->reg = reg_from_dwarf_op(ops);
 	vt->offset = offset_from_dwarf_op(ops);
 	vt->next = *var_types;
@@ -1743,6 +1745,8 @@ static int __die_collect_global_vars_cb(Dwarf_Die *die_mem, void *arg)
 
 	vt->die_off = dwarf_dieoffset(&type_die);
 	vt->addr = ops->number;
+	vt->end = 0;
+	vt->has_range = false;
 	vt->reg = -1;
 	vt->offset = 0;
 	vt->next = *var_types;
diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h
index 939a59c91796..a79968a2e573 100644
--- a/tools/perf/util/dwarf-aux.h
+++ b/tools/perf/util/dwarf-aux.h
@@ -148,10 +148,12 @@ struct die_var_type {
 	struct die_var_type *next;
 	u64 die_off;
 	u64 addr;
+	u64 end;        /* end address of location range */
 	int reg;
 	int offset;
 	/* Whether the register holds a address to the type */
 	bool is_reg_var_addr;
+	bool has_range; /* whether end is valid */
 };
 
 /* Return type info of a member at offset */

From a90407a5a89a29f3c4af89e55afe4d0489b8a81c Mon Sep 17 00:00:00 2001
From: Zecheng Li <zli94@ncsu.edu>
Date: Mon, 9 Mar 2026 13:55:24 -0400
Subject: [PATCH 046/131] perf dwarf-aux: Collect all variable locations for
 insn tracking

Previously, only the first DWARF location entry was collected for each
variable. This was based on the assumption that instruction tracking
could reconstruct the remaining state. However, variables may have
different locations across different address ranges, and relying solely
on instruction tracking can miss valid type information.

Change __die_collect_vars_cb() to iterate over all location entries
using dwarf_getlocations() in a loop. This ensures that variables with
multiple location ranges are properly tracked, improving type coverage.

Signed-off-by: Zecheng Li <zli94@ncsu.edu>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/dwarf-aux.c | 58 ++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c
index 0710c875416f..92db2fccc788 100644
--- a/tools/perf/util/dwarf-aux.c
+++ b/tools/perf/util/dwarf-aux.c
@@ -1645,6 +1645,7 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg)
 	Dwarf_Op *ops;
 	size_t nops;
 	struct die_var_type *vt;
+	ptrdiff_t off;
 
 	if (tag != DW_TAG_variable && tag != DW_TAG_formal_parameter)
 		return DIE_FIND_CB_SIBLING;
@@ -1652,41 +1653,40 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg)
 	if (dwarf_attr(die_mem, DW_AT_location, &attr) == NULL)
 		return DIE_FIND_CB_SIBLING;
 
-	/*
-	 * Only collect the first location as it can reconstruct the
-	 * remaining state by following the instructions.
-	 * start = 0 means it covers the whole range.
-	 */
-	if (dwarf_getlocations(&attr, 0, &base, &start, &end, &ops, &nops) <= 0)
-		return DIE_FIND_CB_SIBLING;
-
-	if (!check_allowed_ops(ops, nops))
-		return DIE_FIND_CB_SIBLING;
-
 	if (__die_get_real_type(die_mem, &type_die) == NULL)
 		return DIE_FIND_CB_SIBLING;
 
-	vt = malloc(sizeof(*vt));
-	if (vt == NULL)
-		return DIE_FIND_CB_END;
+	/*
+	 * Collect all location entries as variables may have different
+	 * locations across different address ranges.
+	 */
+	off = 0;
+	while ((off = dwarf_getlocations(&attr, off, &base, &start, &end, &ops, &nops)) > 0) {
+		if (!check_allowed_ops(ops, nops))
+			continue;
 
-	/* Usually a register holds the value of a variable */
-	vt->is_reg_var_addr = false;
+		vt = malloc(sizeof(*vt));
+		if (vt == NULL)
+			return DIE_FIND_CB_END;
 
-	if (((ops->atom >= DW_OP_breg0 && ops->atom <= DW_OP_breg31) ||
-	      ops->atom == DW_OP_bregx || ops->atom == DW_OP_fbreg) &&
-	      !is_breg_access_indirect(ops, nops))
-		/* The register contains an address of the variable. */
-		vt->is_reg_var_addr = true;
+		/* Usually a register holds the value of a variable */
+		vt->is_reg_var_addr = false;
 
-	vt->die_off = dwarf_dieoffset(&type_die);
-	vt->addr = start;
-	vt->end = end;
-	vt->has_range = (end != 0 || start != 0);
-	vt->reg = reg_from_dwarf_op(ops);
-	vt->offset = offset_from_dwarf_op(ops);
-	vt->next = *var_types;
-	*var_types = vt;
+		if (((ops->atom >= DW_OP_breg0 && ops->atom <= DW_OP_breg31) ||
+		      ops->atom == DW_OP_bregx || ops->atom == DW_OP_fbreg) &&
+		      !is_breg_access_indirect(ops, nops))
+			/* The register contains an address of the variable. */
+			vt->is_reg_var_addr = true;
+
+		vt->die_off = dwarf_dieoffset(&type_die);
+		vt->addr = start;
+		vt->end = end;
+		vt->has_range = (end != 0 || start != 0);
+		vt->reg = reg_from_dwarf_op(ops);
+		vt->offset = offset_from_dwarf_op(ops);
+		vt->next = *var_types;
+		*var_types = vt;
+	}
 
 	return DIE_FIND_CB_SIBLING;
 }

From d84db579d75fd32ea6dd7814c8cf6b1c8b45ac05 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 16:45:56 -0700
Subject: [PATCH 047/131] perf evsel: Improve falling back from cycles

Switch to using evsel__match rather than comparing perf_event_attr
values, as this is robust on hybrid architectures.
Ensure evsel->pmu matches the evsel->core.attr.
Remove exclude bits that get set in other fallback attempts when
switching the event.
Log the event name with modifiers when switching the event on fallback.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/evsel.c | 45 ++++++++++++++++++++++++++++-------------
 tools/perf/util/evsel.h |  2 ++
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f59228c1a39e..bd14d9bbc91f 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3785,25 +3785,42 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err,
 {
 	int paranoid;
 
-	if ((err == ENOENT || err == ENXIO || err == ENODEV) &&
-	    evsel->core.attr.type   == PERF_TYPE_HARDWARE &&
-	    evsel->core.attr.config == PERF_COUNT_HW_CPU_CYCLES) {
+	if ((err == ENODEV || err == ENOENT || err == ENXIO) &&
+	    evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
 		/*
-		 * If it's cycles then fall back to hrtimer based cpu-clock sw
-		 * counter, which is always available even if no PMU support.
-		 *
-		 * PPC returns ENXIO until 2.6.37 (behavior changed with commit
-		 * b0a873e).
+		 * If the legacy hardware cycles event fails then fall back
+		 * to hrtimer based cpu-clock sw counter, which is always
+		 * available even if no PMU support. PPC returned ENXIO rather
+		 * than ENODEV or ENOENT until 2.6.37.
 		 */
-		evsel->core.attr.type   = PERF_TYPE_SOFTWARE;
+		evsel->pmu = perf_pmus__find_by_type(PERF_TYPE_SOFTWARE);
+		assert(evsel->pmu); /* software is a "well-known" PMU type and can't fail. */
+
+		/* Configure the event. */
+		evsel->core.attr.type = PERF_TYPE_SOFTWARE;
 		evsel->core.attr.config = target__has_cpu(target)
 			? PERF_COUNT_SW_CPU_CLOCK
 			: PERF_COUNT_SW_TASK_CLOCK;
-		scnprintf(msg, msgsize,
-			"The cycles event is not supported, trying to fall back to %s",
-			target__has_cpu(target) ? "cpu-clock" : "task-clock");
+		evsel->core.is_pmu_core = false;
 
+		/* Remove excludes for new event. */
+		if (evsel->fallenback_eacces) {
+			evsel->core.attr.exclude_kernel = 0;
+			evsel->core.attr.exclude_hv     = 0;
+			evsel->fallenback_eacces = false;
+		}
+		if (evsel->fallenback_eopnotsupp) {
+			evsel->core.attr.exclude_guest = 0;
+			evsel->fallenback_eopnotsupp = false;
+		}
+
+		/* Name is recomputed by evsel__name. */
 		zfree(&evsel->name);
+
+		/* Log message. */
+		scnprintf(msg, msgsize,
+			  "The cycles event is not supported, trying to fall back to %s",
+			  evsel__name(evsel));
 		return true;
 	} else if (err == EACCES && !evsel->core.attr.exclude_kernel &&
 		   (paranoid = perf_event_paranoid()) > 1) {
@@ -3830,7 +3847,7 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err,
 			  " samples", paranoid);
 		evsel->core.attr.exclude_kernel = 1;
 		evsel->core.attr.exclude_hv     = 1;
-
+		evsel->fallenback_eacces = true;
 		return true;
 	} else if (err == EOPNOTSUPP && !evsel->core.attr.exclude_guest &&
 		   !evsel->exclude_GH) {
@@ -3851,7 +3868,7 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err,
 		/* Apple M1 requires exclude_guest */
 		scnprintf(msg, msgsize, "Trying to fall back to excluding guest samples");
 		evsel->core.attr.exclude_guest = 1;
-
+		evsel->fallenback_eopnotsupp = true;
 		return true;
 	}
 no_fallback:
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index a3d754c029a0..97f57fab28ce 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -124,6 +124,8 @@ struct evsel {
 	bool			default_metricgroup; /* A member of the Default metricgroup */
 	bool			default_show_events; /* If a default group member, show the event */
 	bool			needs_uniquify;
+	bool			fallenback_eacces;
+	bool			fallenback_eopnotsupp;
 	struct hashmap		*per_pkg_mask;
 	int			err;
 	int			script_output_type;

From 8ebb69e549aa900cb51c0876c4f6ea03e5ece438 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 16:45:57 -0700
Subject: [PATCH 048/131] perf target: Constify simple check functions

Allow the target to be const in callers.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/target.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/target.h b/tools/perf/util/target.h
index 84ebb9c940c6..bc2bff9c6842 100644
--- a/tools/perf/util/target.h
+++ b/tools/perf/util/target.h
@@ -49,22 +49,22 @@ uid_t parse_uid(const char *str);
 
 int target__strerror(struct target *target, int errnum, char *buf, size_t buflen);
 
-static inline bool target__has_task(struct target *target)
+static inline bool target__has_task(const struct target *target)
 {
 	return target->tid || target->pid;
 }
 
-static inline bool target__has_cpu(struct target *target)
+static inline bool target__has_cpu(const struct target *target)
 {
 	return target->system_wide || target->cpu_list;
 }
 
-static inline bool target__none(struct target *target)
+static inline bool target__none(const struct target *target)
 {
 	return !target__has_task(target) && !target__has_cpu(target);
 }
 
-static inline bool target__enable_on_exec(struct target *target)
+static inline bool target__enable_on_exec(const struct target *target)
 {
 	/*
 	 * Normally enable_on_exec should be set if:
@@ -75,12 +75,12 @@ static inline bool target__enable_on_exec(struct target *target)
 	return target__none(target) && !target->initial_delay;
 }
 
-static inline bool target__has_per_thread(struct target *target)
+static inline bool target__has_per_thread(const struct target *target)
 {
 	return target->system_wide && target->per_thread;
 }
 
-static inline bool target__uses_dummy_map(struct target *target)
+static inline bool target__uses_dummy_map(const struct target *target)
 {
 	bool use_dummy = false;
 

From 443556be8adc59126624eccd41f4150ec0e5a11a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 16:45:58 -0700
Subject: [PATCH 049/131] perf evsel: Constify option arguments to config
 functions

The options are used to configure the evsel but are not themselves
configured. Make the arguments const to better capture this.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/evsel.c | 20 ++++++++++----------
 tools/perf/util/evsel.h |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index bd14d9bbc91f..54c8922a8e47 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1015,8 +1015,8 @@ uint16_t evsel__e_machine(struct evsel *evsel, uint32_t *e_flags)
 	return perf_session__e_machine(session, e_flags);
 }
 
-static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
-				      struct callchain_param *param)
+static void __evsel__config_callchain(struct evsel *evsel, const struct record_opts *opts,
+				      const struct callchain_param *param)
 {
 	bool function = evsel__is_function_event(evsel);
 	struct perf_event_attr *attr = &evsel->core.attr;
@@ -1080,14 +1080,14 @@ static void __evsel__config_callchain(struct evsel *evsel, struct record_opts *o
 		attr->defer_callchain = 1;
 }
 
-void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
-			     struct callchain_param *param)
+void evsel__config_callchain(struct evsel *evsel, const struct record_opts *opts,
+			     const struct callchain_param *param)
 {
 	if (param->enabled)
 		return __evsel__config_callchain(evsel, opts, param);
 }
 
-static void evsel__reset_callgraph(struct evsel *evsel, struct callchain_param *param)
+static void evsel__reset_callgraph(struct evsel *evsel, const struct callchain_param *param)
 {
 	struct perf_event_attr *attr = &evsel->core.attr;
 
@@ -1106,7 +1106,7 @@ static void evsel__reset_callgraph(struct evsel *evsel, struct callchain_param *
 
 static void evsel__apply_ratio_to_prev(struct evsel *evsel,
 				       struct perf_event_attr *attr,
-				       struct record_opts *opts,
+				       const struct record_opts *opts,
 				       const char *buf)
 {
 	struct perf_event_attr *prev_attr = NULL;
@@ -1170,7 +1170,7 @@ static void evsel__apply_ratio_to_prev(struct evsel *evsel,
 }
 
 static void evsel__apply_config_terms(struct evsel *evsel,
-				      struct record_opts *opts, bool track)
+				      const struct record_opts *opts, bool track)
 {
 	struct evsel_config_term *term;
 	struct list_head *config_terms = &evsel->config_terms;
@@ -1445,7 +1445,7 @@ void __weak arch_evsel__apply_ratio_to_prev(struct evsel *evsel __maybe_unused,
 {
 }
 
-static void evsel__set_default_freq_period(struct record_opts *opts,
+static void evsel__set_default_freq_period(const struct record_opts *opts,
 					   struct perf_event_attr *attr)
 {
 	if (opts->freq) {
@@ -1490,8 +1490,8 @@ bool evsel__is_offcpu_event(struct evsel *evsel)
  *     enable/disable events specifically, as there's no
  *     initial traced exec call.
  */
-void evsel__config(struct evsel *evsel, struct record_opts *opts,
-		   struct callchain_param *callchain)
+void evsel__config(struct evsel *evsel, const struct record_opts *opts,
+		   const struct callchain_param *callchain)
 {
 	struct evsel *leader = evsel__leader(evsel);
 	struct perf_event_attr *attr = &evsel->core.attr;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 97f57fab28ce..339b5c08a33d 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -287,10 +287,10 @@ void evsel__set_priv_destructor(void (*destructor)(void *priv));
 
 struct callchain_param;
 
-void evsel__config(struct evsel *evsel, struct record_opts *opts,
-		   struct callchain_param *callchain);
-void evsel__config_callchain(struct evsel *evsel, struct record_opts *opts,
-			     struct callchain_param *callchain);
+void evsel__config(struct evsel *evsel, const struct record_opts *opts,
+		   const struct callchain_param *callchain);
+void evsel__config_callchain(struct evsel *evsel, const struct record_opts *opts,
+			     const  struct callchain_param *callchain);
 
 int __evsel__sample_size(u64 sample_type);
 void evsel__calc_id_pos(struct evsel *evsel);

From c006753c3aae432efda28d5aaea4b8fec0343da8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 16:45:59 -0700
Subject: [PATCH 050/131] perf callchain: Refactor callchain option parsing

record_opts__parse_callchain is shared by builtin-record and
builtin-trace, and it is declared in callchain.h. Move the definition to
callchain.c for consistency with the header. In other cases make the
option callback a small static stub that then calls into callchain.c.

Make the no argument '-g' callchain option just a short-cut for
'--call-graph fp' so that there is consistency in how the arguments
are handled. This requires the const char* string to be strdup-ed in
__parse_callchain_report_opt. For consistency also make
parse_callchain_record use strdup and remove some unnecessary
casts. Also, be more explicit about the '-g' behavior if there is a
.perfconfig file setting.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-record.c | 65 ++++++++-------------------------
 tools/perf/builtin-top.c    | 25 +++++++++----
 tools/perf/builtin-trace.c  |  9 ++++-
 tools/perf/util/callchain.c | 73 ++++++++++++++++++++++++++++++-------
 tools/perf/util/callchain.h | 12 ++----
 5 files changed, 104 insertions(+), 80 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 40917a0be238..59b8125d1b13 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -2975,65 +2975,30 @@ out_delete_session:
 	return status;
 }
 
-static void callchain_debug(struct callchain_param *callchain)
-{
-	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
-
-	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
-
-	if (callchain->record_mode == CALLCHAIN_DWARF)
-		pr_debug("callchain: stack dump size %d\n",
-			 callchain->dump_size);
-}
-
-int record_opts__parse_callchain(struct record_opts *record,
-				 struct callchain_param *callchain,
-				 const char *arg, bool unset)
-{
-	int ret;
-	callchain->enabled = !unset;
-
-	/* --no-call-graph */
-	if (unset) {
-		callchain->record_mode = CALLCHAIN_NONE;
-		pr_debug("callchain: disabled\n");
-		return 0;
-	}
-
-	ret = parse_callchain_record_opt(arg, callchain);
-	if (!ret) {
-		/* Enable data address sampling for DWARF unwind. */
-		if (callchain->record_mode == CALLCHAIN_DWARF &&
-		    !record->record_data_mmap_set)
-			record->record_data_mmap = true;
-		callchain_debug(callchain);
-	}
-
-	return ret;
-}
-
-int record_parse_callchain_opt(const struct option *opt,
+static int record_parse_callchain_opt(const struct option *opt,
 			       const char *arg,
 			       int unset)
 {
 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
 }
 
-int record_callchain_opt(const struct option *opt,
-			 const char *arg __maybe_unused,
-			 int unset __maybe_unused)
+static int record_callchain_opt(const struct option *opt,
+				const char *arg __maybe_unused,
+				int unset)
 {
-	struct callchain_param *callchain = opt->value;
+	/*
+	 * The -g option only sets the callchain if not already configured by
+	 * .perfconfig. It does, however, enable it.
+	 */
+	if (callchain_param.record_mode != CALLCHAIN_NONE) {
+		callchain_param.enabled = true;
+		return 0;
+	}
 
-	callchain->enabled = true;
-
-	if (callchain->record_mode == CALLCHAIN_NONE)
-		callchain->record_mode = CALLCHAIN_FP;
-
-	callchain_debug(callchain);
-	return 0;
+	return record_opts__parse_callchain(opt->value, &callchain_param, "fp", unset);
 }
 
+
 static int perf_record_config(const char *var, const char *value, void *cb)
 {
 	struct record *rec = cb;
@@ -3525,7 +3490,7 @@ static struct option __record_options[] = {
 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
 		     record__mmap_flush_parse),
-	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
+	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
 			   NULL, "enables call-graph recording" ,
 			   &record_callchain_opt),
 	OPT_CALLBACK(0, "call-graph", &record.opts,
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 710604c4f6f6..b6726f4dffb3 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1386,13 +1386,6 @@ out_join_thread:
 	return ret;
 }
 
-static int
-callchain_opt(const struct option *opt, const char *arg, int unset)
-{
-	symbol_conf.use_callchain = true;
-	return record_callchain_opt(opt, arg, unset);
-}
-
 static int
 parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 {
@@ -1413,6 +1406,24 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset)
 	return parse_callchain_top_opt(arg);
 }
 
+static int
+callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unset)
+{
+	struct callchain_param *callchain = opt->value;
+
+	/*
+	 * The -g option only sets the callchain if not already configured by
+	 * .perfconfig. It does, however, enable it.
+	 */
+	if (callchain->record_mode != CALLCHAIN_NONE) {
+		callchain->enabled = true;
+		return 0;
+	}
+
+	return parse_callchain_opt(opt, "fp", unset);
+}
+
+
 static int perf_top_config(const char *var, const char *value, void *cb __maybe_unused)
 {
 	if (!strcmp(var, "top.call-graph")) {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 1c38f3d16a31..f487fbaa0ad6 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -5300,6 +5300,13 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
 	return 0;
 }
 
+static int trace_parse_callchain_opt(const struct option *opt,
+				     const char *arg,
+				     int unset)
+{
+	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
+}
+
 static int trace__config(const char *var, const char *value, void *arg)
 {
 	struct trace *trace = arg;
@@ -5447,7 +5454,7 @@ int cmd_trace(int argc, const char **argv)
 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
 	OPT_CALLBACK(0, "call-graph", &trace.opts,
 		     "record_mode[,record_size]", record_callchain_help,
-		     &record_parse_callchain_opt),
+		     &trace_parse_callchain_opt),
 	OPT_BOOLEAN(0, "libtraceevent_print", &trace.libtraceevent_print,
 		    "Use libtraceevent to print the tracepoint arguments."),
 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 8ff0898799ee..f879b84f8ff9 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -30,6 +30,7 @@
 #include "map.h"
 #include "callchain.h"
 #include "branch.h"
+#include "record.h"
 #include "symbol.h"
 #include "thread.h"
 #include "util.h"
@@ -170,7 +171,7 @@ static int get_stack_size(const char *str, unsigned long *_size)
 static int
 __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
 {
-	char *tok;
+	char *tok, *arg_copy;
 	char *endptr, *saveptr = NULL;
 	bool minpcnt_set = false;
 	bool record_opt_set = false;
@@ -182,12 +183,17 @@ __parse_callchain_report_opt(const char *arg, bool allow_record_opt)
 	if (!arg)
 		return 0;
 
-	while ((tok = strtok_r((char *)arg, ",", &saveptr)) != NULL) {
+	arg_copy = strdup(arg);
+	if (!arg_copy)
+		return -ENOMEM;
+
+	tok = strtok_r(arg_copy, ",", &saveptr);
+	while (tok) {
 		if (!strncmp(tok, "none", strlen(tok))) {
 			callchain_param.mode = CHAIN_NONE;
 			callchain_param.enabled = false;
 			symbol_conf.use_callchain = false;
-			return 0;
+			goto out;
 		}
 
 		if (!parse_callchain_mode(tok) ||
@@ -214,30 +220,35 @@ try_numbers:
 			unsigned long size = 0;
 
 			if (get_stack_size(tok, &size) < 0)
-				return -1;
+				goto err_out;
 			callchain_param.dump_size = size;
 			try_stack_size = false;
 		} else if (!minpcnt_set) {
 			/* try to get the min percent */
 			callchain_param.min_percent = strtod(tok, &endptr);
 			if (tok == endptr)
-				return -1;
+				goto err_out;
 			minpcnt_set = true;
 		} else {
 			/* try print limit at last */
 			callchain_param.print_limit = strtoul(tok, &endptr, 0);
 			if (tok == endptr)
-				return -1;
+				goto err_out;
 		}
 next:
-		arg = NULL;
+		tok = strtok_r(NULL, ",", &saveptr);
 	}
 
 	if (callchain_register_param(&callchain_param) < 0) {
 		pr_err("Can't register callchain params\n");
-		return -1;
+		goto err_out;
 	}
+out:
+	free(arg_copy);
 	return 0;
+err_out:
+	free(arg_copy);
+	return -1;
 }
 
 int parse_callchain_report_opt(const char *arg)
@@ -257,14 +268,12 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
 	int ret = -1;
 
 	/* We need buffer that we know we can write to. */
-	buf = malloc(strlen(arg) + 1);
+	buf = strdup(arg);
 	if (!buf)
 		return -ENOMEM;
 
-	strcpy(buf, arg);
-
-	tok = strtok_r((char *)buf, ",", &saveptr);
-	name = tok ? : (char *)buf;
+	tok = strtok_r(buf, ",", &saveptr);
+	name = tok ? : buf;
 
 	do {
 		/* Framepointer style */
@@ -328,6 +337,44 @@ int parse_callchain_record(const char *arg, struct callchain_param *param)
 	return ret;
 }
 
+static void callchain_debug(const struct callchain_param *callchain)
+{
+	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
+
+	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
+
+	if (callchain->record_mode == CALLCHAIN_DWARF)
+		pr_debug("callchain: stack dump size %d\n",
+			 callchain->dump_size);
+}
+
+int record_opts__parse_callchain(struct record_opts *record,
+				 struct callchain_param *callchain,
+				 const char *arg, bool unset)
+{
+	int ret;
+
+	callchain->enabled = !unset;
+
+	/* --no-call-graph */
+	if (unset) {
+		callchain->record_mode = CALLCHAIN_NONE;
+		pr_debug("callchain: disabled\n");
+		return 0;
+	}
+
+	ret = parse_callchain_record_opt(arg, callchain);
+	if (!ret) {
+		/* Enable data address sampling for DWARF unwind. */
+		if (callchain->record_mode == CALLCHAIN_DWARF &&
+		    !record->record_data_mmap_set)
+			record->record_data_mmap = true;
+		callchain_debug(callchain);
+	}
+
+	return ret;
+}
+
 int perf_callchain_config(const char *var, const char *value)
 {
 	char *endptr;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index df54ddb8c0cb..06d463ccc7a0 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -9,11 +9,13 @@
 
 struct addr_location;
 struct evsel;
+struct hist_entry;
+struct hists;
 struct ip_callchain;
 struct map;
 struct perf_sample;
+struct record_opts;
 struct thread;
-struct hists;
 
 #define HELP_PAD "\t\t\t\t"
 
@@ -237,14 +239,6 @@ struct callchain_cursor *get_tls_callchain_cursor(void);
 int callchain_cursor__copy(struct callchain_cursor *dst,
 			   struct callchain_cursor *src);
 
-struct option;
-struct hist_entry;
-
-int record_parse_callchain_opt(const struct option *opt, const char *arg, int unset);
-int record_callchain_opt(const struct option *opt, const char *arg, int unset);
-
-struct record_opts;
-
 int record_opts__parse_callchain(struct record_opts *record,
 				 struct callchain_param *callchain,
 				 const char *arg, bool unset);

From ca76fb67ebdd5e1a30a242d06dc096fddd670734 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 16:46:00 -0700
Subject: [PATCH 051/131] perf evlist: Improve default event for s390

Frame pointer callchains are not supported on s390 and dwarf
callchains are only supported on software events.

Switch the default event from the hardware 'cycles' event to the
software 'cpu-clock' or 'task-clock' on s390 if callchains are
enabled. Move some of the target initialization earlier in builtin-top
and builtin-record, so it is ready for use by evlist__new_default.

If frame pointer callchains are requested on s390 show a
warning. Modify the '-g' option of `perf top` and `perf record` to
default to dwarf callchains on s390.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-record.c      | 18 +++++++++++-------
 tools/perf/builtin-top.c         | 23 ++++++++++++-----------
 tools/perf/tests/event_update.c  |  4 +++-
 tools/perf/tests/expand-cgroup.c |  4 +++-
 tools/perf/tests/perf-record.c   |  7 +++++--
 tools/perf/tests/topology.c      |  4 +++-
 tools/perf/util/evlist.c         | 32 +++++++++++++++++++++-----------
 tools/perf/util/evlist.h         |  2 +-
 tools/perf/util/evsel.c          |  5 +++++
 9 files changed, 64 insertions(+), 35 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 59b8125d1b13..3276ffdc3141 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -55,6 +55,7 @@
 #include "asm/bug.h"
 #include "perf.h"
 #include "cputopo.h"
+#include "dwarf-regs.h"
 
 #include <errno.h>
 #include <inttypes.h>
@@ -2995,7 +2996,9 @@ static int record_callchain_opt(const struct option *opt,
 		return 0;
 	}
 
-	return record_opts__parse_callchain(opt->value, &callchain_param, "fp", unset);
+	return record_opts__parse_callchain(opt->value, &callchain_param,
+					    EM_HOST != EM_S390 ? "fp" : "dwarf",
+					    unset);
 }
 
 
@@ -4095,8 +4098,11 @@ int cmd_record(int argc, const char **argv)
 
 	perf_debuginfod_setup(&record.debuginfod);
 
-	/* Make system wide (-a) the default target. */
-	if (!argc && target__none(&rec->opts.target))
+	/*
+	 * Use system wide (-a) for the default target (ie. when no
+	 * workload). User ID filtering also implies system-wide.
+	 */
+	if ((!argc && target__none(&rec->opts.target)) || rec->uid_str)
 		rec->opts.target.system_wide = true;
 
 	if (nr_cgroups && !rec->opts.target.system_wide) {
@@ -4274,7 +4280,8 @@ int cmd_record(int argc, const char **argv)
 		record.opts.tail_synthesize = true;
 
 	if (rec->evlist->core.nr_entries == 0) {
-		struct evlist *def_evlist = evlist__new_default();
+		struct evlist *def_evlist = evlist__new_default(&rec->opts.target,
+								callchain_param.enabled);
 
 		if (!def_evlist)
 			goto out;
@@ -4303,9 +4310,6 @@ int cmd_record(int argc, const char **argv)
 		err = parse_uid_filter(rec->evlist, uid);
 		if (err)
 			goto out;
-
-		/* User ID filtering implies system wide. */
-		rec->opts.target.system_wide = true;
 	}
 
 	/* Enable ignoring missing threads when -p option is defined. */
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index b6726f4dffb3..37950efb28ac 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -56,6 +56,7 @@
 #include "util/debug.h"
 #include "util/ordered-events.h"
 #include "util/pfm.h"
+#include "dwarf-regs.h"
 
 #include <assert.h>
 #include <elf.h>
@@ -1420,7 +1421,7 @@ callchain_opt(const struct option *opt, const char *arg __maybe_unused, int unse
 		return 0;
 	}
 
-	return parse_callchain_opt(opt, "fp", unset);
+	return parse_callchain_opt(opt, EM_HOST != EM_S390 ? "fp" : "dwarf", unset);
 }
 
 
@@ -1705,8 +1706,17 @@ int cmd_top(int argc, const char **argv)
 	if (annotate_check_args() < 0)
 		goto out_delete_evlist;
 
+	status = target__validate(target);
+	if (status) {
+		target__strerror(target, status, errbuf, BUFSIZ);
+		ui__warning("%s\n", errbuf);
+	}
+
+	if (target__none(target))
+		target->system_wide = true;
+
 	if (!top.evlist->core.nr_entries) {
-		struct evlist *def_evlist = evlist__new_default();
+		struct evlist *def_evlist = evlist__new_default(target, callchain_param.enabled);
 
 		if (!def_evlist)
 			goto out_delete_evlist;
@@ -1799,12 +1809,6 @@ int cmd_top(int argc, const char **argv)
 		goto out_delete_evlist;
 	}
 
-	status = target__validate(target);
-	if (status) {
-		target__strerror(target, status, errbuf, BUFSIZ);
-		ui__warning("%s\n", errbuf);
-	}
-
 	if (top.uid_str) {
 		uid_t uid = parse_uid(top.uid_str);
 
@@ -1818,9 +1822,6 @@ int cmd_top(int argc, const char **argv)
 			goto out_delete_evlist;
 	}
 
-	if (target__none(target))
-		target->system_wide = true;
-
 	if (evlist__create_maps(top.evlist, target) < 0) {
 		ui__error("Couldn't create thread/CPU maps: %s\n",
 			  errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c
index cb9e6de2e033..facc65e29f20 100644
--- a/tools/perf/tests/event_update.c
+++ b/tools/perf/tests/event_update.c
@@ -8,6 +8,7 @@
 #include "header.h"
 #include "machine.h"
 #include "util/synthetic-events.h"
+#include "target.h"
 #include "tool.h"
 #include "tests.h"
 #include "debug.h"
@@ -81,7 +82,8 @@ static int test__event_update(struct test_suite *test __maybe_unused, int subtes
 {
 	struct evsel *evsel;
 	struct event_name tmp;
-	struct evlist *evlist = evlist__new_default();
+	struct target target = {};
+	struct evlist *evlist = evlist__new_default(&target, /*sample_callchains=*/false);
 
 	TEST_ASSERT_VAL("failed to get evlist", evlist);
 
diff --git a/tools/perf/tests/expand-cgroup.c b/tools/perf/tests/expand-cgroup.c
index c7b32a220ca1..dd547f2f77cc 100644
--- a/tools/perf/tests/expand-cgroup.c
+++ b/tools/perf/tests/expand-cgroup.c
@@ -8,6 +8,7 @@
 #include "parse-events.h"
 #include "pmu-events/pmu-events.h"
 #include "pfm.h"
+#include "target.h"
 #include <subcmd/parse-options.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -99,7 +100,8 @@ out:	for (i = 0; i < nr_events; i++)
 static int expand_default_events(void)
 {
 	int ret;
-	struct evlist *evlist = evlist__new_default();
+	struct target target = {};
+	struct evlist *evlist = evlist__new_default(&target, /*sample_callchains=*/false);
 
 	TEST_ASSERT_VAL("failed to get evlist", evlist);
 
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index efbd9cd60c63..c6e31ab8a6b8 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -84,8 +84,11 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest
 	CPU_ZERO_S(cpu_mask_size, cpu_mask);
 
 	perf_sample__init(&sample, /*all=*/false);
-	if (evlist == NULL) /* Fallback for kernels lacking PERF_COUNT_SW_DUMMY */
-		evlist = evlist__new_default();
+	if (evlist == NULL) { /* Fallback for kernels lacking PERF_COUNT_SW_DUMMY */
+		struct target target = {};
+
+		evlist = evlist__new_default(&target, /*sample_callchains=*/false);
+	}
 
 	if (evlist == NULL) {
 		pr_debug("Not enough memory to create evlist\n");
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index ec01150d208d..a34a7ab19a80 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -9,6 +9,7 @@
 #include "evlist.h"
 #include "debug.h"
 #include "pmus.h"
+#include "target.h"
 #include <linux/err.h>
 
 #define TEMPL "/tmp/perf-test-XXXXXX"
@@ -37,11 +38,12 @@ static int session_write_header(char *path)
 		.path = path,
 		.mode = PERF_DATA_MODE_WRITE,
 	};
+	struct target target = {};
 
 	session = perf_session__new(&data, NULL);
 	TEST_ASSERT_VAL("can't get session", !IS_ERR(session));
 
-	session->evlist = evlist__new_default();
+	session->evlist = evlist__new_default(&target, /*sample_callchains=*/false);
 	TEST_ASSERT_VAL("can't get evlist", session->evlist);
 	session->evlist->session = session;
 
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index 591bdf0b3e2a..c702741a9173 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -13,6 +13,7 @@
 #include "util/mmap.h"
 #include "thread_map.h"
 #include "target.h"
+#include "dwarf-regs.h"
 #include "evlist.h"
 #include "evsel.h"
 #include "record.h"
@@ -98,38 +99,47 @@ struct evlist *evlist__new(void)
 	return evlist;
 }
 
-struct evlist *evlist__new_default(void)
+struct evlist *evlist__new_default(const struct target *target, bool sample_callchains)
 {
 	struct evlist *evlist = evlist__new();
 	bool can_profile_kernel;
 	struct perf_pmu *pmu = NULL;
+	struct evsel *evsel;
+	char buf[256];
+	int err;
 
 	if (!evlist)
 		return NULL;
 
 	can_profile_kernel = perf_event_paranoid_check(1);
 
-	while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
-		char buf[256];
-		int err;
-
-		snprintf(buf, sizeof(buf), "%s/cycles/%s", pmu->name,
+	if (EM_HOST == EM_S390 && sample_callchains) {
+		snprintf(buf, sizeof(buf), "software/%s/%s",
+			 target__has_cpu(target) ? "cpu-clock" : "task-clock",
 			 can_profile_kernel ? "P" : "Pu");
 		err = parse_event(evlist, buf);
-		if (err) {
-			evlist__delete(evlist);
-			return NULL;
+		if (err)
+			goto out_err;
+	} else {
+		while ((pmu = perf_pmus__scan_core(pmu)) != NULL) {
+			snprintf(buf, sizeof(buf), "%s/cycles/%s", pmu->name,
+				can_profile_kernel ? "P" : "Pu");
+			err = parse_event(evlist, buf);
+			if (err)
+				goto out_err;
 		}
 	}
 
+	/* If there is only 1 event a sample identifier isn't necessary. */
 	if (evlist->core.nr_entries > 1) {
-		struct evsel *evsel;
-
 		evlist__for_each_entry(evlist, evsel)
 			evsel__set_sample_id(evsel, /*can_sample_identifier=*/false);
 	}
 
 	return evlist;
+out_err:
+	evlist__delete(evlist);
+	return NULL;
 }
 
 struct evlist *evlist__new_dummy(void)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index d17c3b57a409..e507f5f20ef6 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -104,7 +104,7 @@ struct evsel_str_handler {
 };
 
 struct evlist *evlist__new(void);
-struct evlist *evlist__new_default(void);
+struct evlist *evlist__new_default(const struct target *target, bool sample_callchains);
 struct evlist *evlist__new_dummy(void);
 void evlist__init(struct evlist *evlist, struct perf_cpu_map *cpus,
 		  struct perf_thread_map *threads);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 54c8922a8e47..5a294595a677 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -1021,6 +1021,11 @@ static void __evsel__config_callchain(struct evsel *evsel, const struct record_o
 	bool function = evsel__is_function_event(evsel);
 	struct perf_event_attr *attr = &evsel->core.attr;
 
+	if (EM_HOST == EM_S390 && param->record_mode == CALLCHAIN_FP) {
+		pr_warning_once(
+			"Framepointer unwinding lacks kernel support. Use '--call-graph dwarf'\n");
+	}
+
 	evsel__set_sample_bit(evsel, CALLCHAIN);
 
 	attr->sample_max_stack = param->max_stack;

From 5c980ab238c8a9e2b24221603f11eadc98a7f45e Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Tue, 17 Mar 2026 18:58:00 +0000
Subject: [PATCH 052/131] tools build: Correct link flags for libopenssl

The perf static build reports that the BPF skeleton is disabled due to
the missing libopenssl feature.

Use PKG_CONFIG to determine the link flags for libopenssl.  Add
"--static" to the PKG_CONFIG command for static linking.

Fixes: 7678523109d1 ("tools/build: Add a feature test for libopenssl")
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/build/feature/Makefile | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 1fbcb3ce74d2..f163a245837a 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -103,12 +103,18 @@ else
   endif
 endif
 
+ifeq ($(findstring -static,${LDFLAGS}),-static)
+  PKG_CONFIG += --static
+endif
+
 all: $(FILES)
 
 __BUILD = $(CC) $(CFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.c,$(@F)) $(LDFLAGS)
   BUILD = $(__BUILD) > $(@:.bin=.make.output) 2>&1
   BUILD_BFD = $(BUILD) -DPACKAGE='"perf"' -lbfd -ldl
-  BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd -lssl
+  BUILD_ALL = $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -lslang \
+	      $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -ldl -lz -llzma -lzstd \
+	      $(shell $(PKG_CONFIG) --libs --cflags openssl 2>/dev/null)
 
 __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ $(patsubst %.bin,%.cpp,$(@F)) $(LDFLAGS)
   BUILDXX = $(__BUILDXX) > $(@:.bin=.make.output) 2>&1
@@ -384,7 +390,7 @@ $(OUTPUT)test-libpfm4.bin:
 	$(BUILD) -lpfm
 
 $(OUTPUT)test-libopenssl.bin:
-	$(BUILD) -lssl
+	$(BUILD) $(shell $(PKG_CONFIG) --libs --cflags openssl 2>/dev/null)
 
 $(OUTPUT)test-bpftool-skeletons.bin:
 	$(SYSTEM_BPFTOOL) version | grep '^features:.*skeletons' \

From 46a009cf0d85cba05d4667214db18a4c20dd6b8e Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Thu, 19 Mar 2026 09:47:54 +0100
Subject: [PATCH 053/131] perf record: Add support for arch_sdt_arg_parse_op()
 on s390

commit e5e66adfe45a6 ("perf regs: Remove __weak attributive arch_sdt_arg_parse_op() function")
removes arch_sdt_arg_parse_op() functions and reveals missing s390 support.
The following warning is printed:

  Unknown ELF machine 22, standard arguments parse will be skipped.

ELF machine 22 is the EM_S390 host. This happens with the command
  # ./perf record -v -- stress-ng -t 1s --matrix 0
when the event is not specified.

Add an s390-specific __perf_sdt_arg_parse_op_s390() function to support
architecture calls to arch_sdt_arg_parse_op() for s390.
The warning disappears.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Tested-by: Jan Polensky <japo@linux.ibm.com>
Cc: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../perf/util/perf-regs-arch/perf_regs_s390.c | 78 +++++++++++++++++++
 tools/perf/util/perf_regs.c                   |  3 +
 tools/perf/util/perf_regs.h                   |  1 +
 3 files changed, 82 insertions(+)

diff --git a/tools/perf/util/perf-regs-arch/perf_regs_s390.c b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
index c61df24edf0f..19f219225183 100644
--- a/tools/perf/util/perf-regs-arch/perf_regs_s390.c
+++ b/tools/perf/util/perf-regs-arch/perf_regs_s390.c
@@ -1,7 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#include <errno.h>
+#include <regex.h>
 #include "../perf_regs.h"
 #include "../../arch/s390/include/perf_regs.h"
+#include "debug.h"
+
+#include <linux/zalloc.h>
+#include <linux/kernel.h>
 
 uint64_t __perf_reg_mask_s390(bool intr __maybe_unused)
 {
@@ -95,3 +101,75 @@ uint64_t __perf_reg_sp_s390(void)
 {
 	return PERF_REG_S390_R15;
 }
+
+/* %rXX */
+#define SDT_OP_REGEX1  "^(%r([0-9]|1[0-5]))$"
+/* +-###(%rXX) */
+#define SDT_OP_REGEX2  "^([+-]?[0-9]+\\(%r([0-9]|1[0-5])\\))$"
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+	static int initialized;
+	int ret = 0;
+
+	if (initialized)
+		return 0;
+
+	ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+	if (ret)
+		goto error;
+	initialized = 1;
+
+	ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+	if (ret)
+		goto free_regex1;
+	initialized = 2;
+
+	return 0;
+
+free_regex1:
+	regfree(&sdt_op_regex1);
+error:
+	pr_debug4("Regex compilation error, initialized %d\n", initialized);
+	initialized = 0;
+	return ret;
+}
+
+/*
+ * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
+ * Possible variants of OP are:
+ *	Format		Example
+ *	-------------------------
+ *	NUM(%rREG)	48(%r1)
+ *	-NUM(%rREG)	-48(%r1)
+ *	+NUM(%rREG)	+48(%r1)
+ *	%rREG		%r1
+ */
+int __perf_sdt_arg_parse_op_s390(char *old_op, char **new_op)
+{
+	int ret, new_len;
+	regmatch_t rm[6];
+
+	*new_op = NULL;
+	ret = sdt_init_op_regex();
+	if (ret)
+		return -EINVAL;
+
+	if (!regexec(&sdt_op_regex1, old_op, ARRAY_SIZE(rm), rm, 0) ||
+	    !regexec(&sdt_op_regex2, old_op, ARRAY_SIZE(rm), rm, 0)) {
+		new_len = 1;    /* NULL byte */
+		new_len += (int)(rm[1].rm_eo - rm[1].rm_so);
+		*new_op = zalloc(new_len);
+		if (!*new_op)
+			return -ENOMEM;
+
+		scnprintf(*new_op, new_len, "%.*s",
+			  (int)(rm[1].rm_eo - rm[1].rm_so), old_op + rm[1].rm_so);
+	} else {
+		pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+		return SDT_ARG_SKIP;
+	}
+
+	return SDT_ARG_VALID;
+}
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index 5b8f34beb24e..f52b0e1f7fc7 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -23,6 +23,9 @@ int perf_sdt_arg_parse_op(uint16_t e_machine, char *old_op, char **new_op)
 	case EM_X86_64:
 		ret = __perf_sdt_arg_parse_op_x86(old_op, new_op);
 		break;
+	case EM_S390:
+		ret = __perf_sdt_arg_parse_op_s390(old_op, new_op);
+		break;
 	default:
 		pr_debug("Unknown ELF machine %d, standard arguments parse will be skipped.\n",
 			 e_machine);
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 7c04700bf837..573f0d1dfe04 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -62,6 +62,7 @@ uint64_t __perf_reg_mask_s390(bool intr);
 const char *__perf_reg_name_s390(int id);
 uint64_t __perf_reg_ip_s390(void);
 uint64_t __perf_reg_sp_s390(void);
+int __perf_sdt_arg_parse_op_s390(char *old_op, char **new_op);
 
 int __perf_sdt_arg_parse_op_x86(char *old_op, char **new_op);
 uint64_t __perf_reg_mask_x86(bool intr);

From cfaade34b52aa1ec553044255702c4b31b57c005 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 19 Mar 2026 16:33:48 -0700
Subject: [PATCH 054/131] perf lock: Fix option value type in parse_max_stack

The value is a void* and the address of an int, max_stack_depth, is
set up in the perf lock options. The parse_max_stack function treats
the int* as an unsigned long*. Make this more correct by declaring the
value to be an int*.

Fixes: 0a277b622670 ("perf lock contention: Check --max-stack option")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-lock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e8962c985d34..5585aeb97684 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -2250,7 +2250,7 @@ static int parse_map_entry(const struct option *opt, const char *str,
 static int parse_max_stack(const struct option *opt, const char *str,
 			   int unset __maybe_unused)
 {
-	unsigned long *len = (unsigned long *)opt->value;
+	int *len = opt->value;
 	long val;
 	char *endptr;
 

From 44311ae84ad9177fb311aee856027861c22f17b2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Thu, 19 Mar 2026 16:33:49 -0700
Subject: [PATCH 055/131] perf stat: Fix opt->value type for parse_cache_level

Commit f5803651b4a4 ("perf stat: Choose the most disaggregate command
line option") changed aggregation option handling for `perf stat` but
not `perf stat report` leading to parse_cache_level being passed a
struct in the `perf stat` case but erroneously an aggr_mode enum value
for `perf stat report`. Change the `perf stat report` aggregation
handling to use the same opt_aggr_mode as `perf stat`. Also, just pass
the boolean for consistency with other boolean argument handling.

Fixes: f5803651b4a4 ("perf stat: Choose the most disaggregate command line option")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-stat.c | 43 +++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 73c2ba7e3076..2eb76d7476b7 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -164,7 +164,7 @@ struct opt_aggr_mode {
 };
 
 /* Turn command line option into most generic aggregation mode setting. */
-static enum aggr_mode opt_aggr_mode_to_aggr_mode(struct opt_aggr_mode *opt_mode)
+static enum aggr_mode opt_aggr_mode_to_aggr_mode(const struct opt_aggr_mode *opt_mode)
 {
 	enum aggr_mode mode = AGGR_GLOBAL;
 
@@ -1219,8 +1219,8 @@ static int parse_cache_level(const struct option *opt,
 			     int unset __maybe_unused)
 {
 	int level;
-	struct opt_aggr_mode *opt_aggr_mode = (struct opt_aggr_mode *)opt->value;
-	u32 *aggr_level = (u32 *)opt->data;
+	bool *per_cache = opt->value;
+	u32 *aggr_level = opt->data;
 
 	/*
 	 * If no string is specified, aggregate based on the topology of
@@ -1258,7 +1258,7 @@ static int parse_cache_level(const struct option *opt,
 		return -EINVAL;
 	}
 out:
-	opt_aggr_mode->cache = true;
+	*per_cache = true;
 	*aggr_level = level;
 	return 0;
 }
@@ -2305,24 +2305,23 @@ static struct perf_stat perf_stat = {
 static int __cmd_report(int argc, const char **argv)
 {
 	struct perf_session *session;
+	struct opt_aggr_mode opt_mode = {};
 	const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file", "input file name"),
-	OPT_SET_UINT(0, "per-socket", &perf_stat.aggr_mode,
-		     "aggregate counts per processor socket", AGGR_SOCKET),
-	OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode,
-		     "aggregate counts per processor die", AGGR_DIE),
-	OPT_SET_UINT(0, "per-cluster", &perf_stat.aggr_mode,
-		     "aggregate counts perf processor cluster", AGGR_CLUSTER),
-	OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level,
-			    "cache level",
-			    "aggregate count at this cache level (Default: LLC)",
+	OPT_BOOLEAN(0, "per-thread", &opt_mode.thread, "aggregate counts per thread"),
+	OPT_BOOLEAN(0, "per-socket", &opt_mode.socket,
+		    "aggregate counts per processor socket"),
+	OPT_BOOLEAN(0, "per-die", &opt_mode.die, "aggregate counts per processor die"),
+	OPT_BOOLEAN(0, "per-cluster", &opt_mode.cluster,
+		    "aggregate counts per processor cluster"),
+	OPT_CALLBACK_OPTARG(0, "per-cache", &opt_mode.cache, &perf_stat.aggr_level,
+			    "cache level", "aggregate count at this cache level (Default: LLC)",
 			    parse_cache_level),
-	OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode,
-		     "aggregate counts per physical processor core", AGGR_CORE),
-	OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode,
-		     "aggregate counts per numa node", AGGR_NODE),
-	OPT_SET_UINT('A', "no-aggr", &perf_stat.aggr_mode,
-		     "disable CPU count aggregation", AGGR_NONE),
+	OPT_BOOLEAN(0, "per-core", &opt_mode.core,
+		    "aggregate counts per physical processor core"),
+	OPT_BOOLEAN(0, "per-node", &opt_mode.node, "aggregate counts per numa node"),
+	OPT_BOOLEAN('A', "no-aggr", &opt_mode.no_aggr,
+		    "disable aggregation across CPUs or PMUs"),
 	OPT_END()
 	};
 	struct stat st;
@@ -2330,6 +2329,10 @@ static int __cmd_report(int argc, const char **argv)
 
 	argc = parse_options(argc, argv, options, stat_report_usage, 0);
 
+	perf_stat.aggr_mode = opt_aggr_mode_to_aggr_mode(&opt_mode);
+	if (perf_stat.aggr_mode == AGGR_GLOBAL)
+		perf_stat.aggr_mode = AGGR_UNSET; /* No option found so leave unset. */
+
 	if (!input_name || !strlen(input_name)) {
 		if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
 			input_name = "-";
@@ -2506,7 +2509,7 @@ int cmd_stat(int argc, const char **argv)
 		OPT_BOOLEAN(0, "per-die", &opt_mode.die, "aggregate counts per processor die"),
 		OPT_BOOLEAN(0, "per-cluster", &opt_mode.cluster,
 			"aggregate counts per processor cluster"),
-		OPT_CALLBACK_OPTARG(0, "per-cache", &opt_mode, &stat_config.aggr_level,
+		OPT_CALLBACK_OPTARG(0, "per-cache", &opt_mode.cache, &stat_config.aggr_level,
 				"cache level", "aggregate count at this cache level (Default: LLC)",
 				parse_cache_level),
 		OPT_BOOLEAN(0, "per-core", &opt_mode.core,

From e397dd81bc45a991c43a97e010aa3fbe72ac833b Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Fri, 20 Mar 2026 16:45:53 -0700
Subject: [PATCH 056/131] perf report: Add comm_nodigit sort key

The "comm" column allows grouping events by the process command. It is
intended to group like programs, despite having different PIDs. But some
workloads may adjust their own command, so that a unique identifier
(e.g. a PID or some other numeric value) is part of the command name.
This destroys the utility of "comm", forcing perf to place each unique
process name into its own bucket, which can contribute to a
combinatorial explosion of memory use in perf report.

Create a less strict version of this column, which ignores digits when
comparing command names. Commands whose names are the same (ignoring
digits) are sorted into the same histogram buckets, and displayed with
the placeholder value "<N>" in the place of digits. For example,
hypothetical command names "kworker/1", "kworker/2" and "kworker/3"
would sort into the same bucket and be represented as "kworker/<N>".

Committer testing:

  $ perf report -s comm,comm_nodigit | grep -F "<N>"
       0.01%  CPU 6/TCG        CPU <N>/TCG
       0.01%  kworker/53:2-mm  kworker/<N>:<N>-mm
       0.01%  migration/24     migration/<N>
       0.01%  kworker/24:1-ev  kworker/<N>:<N>-ev
       0.01%  llvmpipe-8       llvmpipe-<N>

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-report.txt |   3 +-
 tools/perf/util/hist.c                   |   3 +
 tools/perf/util/hist.h                   |   2 +
 tools/perf/util/sort.c                   | 114 +++++++++++++++++++++++
 tools/perf/util/sort.h                   |   2 +
 5 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 802f931ae64d..52f316628e43 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -88,7 +88,7 @@ OPTIONS
 	Sort histogram entries by given key(s) - multiple keys can be specified
 	in CSV format.  Following sort keys are available:
 	pid, comm, dso, symbol, parent, cpu, socket, srcline, weight,
-	local_weight, cgroup_id, addr.
+	local_weight, cgroup_id, addr, comm_nodigit.
 
 	Each key has following meaning:
 
@@ -143,6 +143,7 @@ OPTIONS
 	- weight1: Average value of event specific weight (1st field of weight_struct).
 	- weight2: Average value of event specific weight (2nd field of weight_struct).
 	- weight3: Average value of event specific weight (3rd field of weight_struct).
+	- comm_nodigit: same as comm, with numbers replaced by "<N>"
 
 	By default, overhead, comm, dso and symbol keys are used.
 	(i.e. --sort overhead,comm,dso,symbol).
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 7ffaa3d9851b..fc737a0a8e4d 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -110,6 +110,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
 	len = thread__comm_len(h->thread);
 	if (hists__new_col_len(hists, HISTC_COMM, len))
 		hists__set_col_len(hists, HISTC_THREAD, len + 8);
+	if (hists->hpp_list->comm_nodigit)
+		hists__new_col_len(hists, HISTC_COMM_NODIGIT,
+				   (u16) sort__comm_nodigit_len(h));
 
 	if (h->ms.map) {
 		len = dso__name_len(map__dso(h->ms.map));
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 1d5ea632ca4e..d97a4efb9250 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -44,6 +44,7 @@ enum hist_column {
 	HISTC_THREAD,
 	HISTC_TGID,
 	HISTC_COMM,
+	HISTC_COMM_NODIGIT,
 	HISTC_CGROUP_ID,
 	HISTC_CGROUP,
 	HISTC_PARENT,
@@ -522,6 +523,7 @@ struct perf_hpp_list {
 	int socket;
 	int thread;
 	int comm;
+	int comm_nodigit;
 };
 
 extern struct perf_hpp_list perf_hpp_list;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 42d5cd7ef4e2..fda8fcfa46e0 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <ctype.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <regex.h>
@@ -265,6 +266,115 @@ struct sort_entry sort_comm = {
 	.se_width_idx	= HISTC_COMM,
 };
 
+/* --sort comm_nodigit */
+
+size_t sort__comm_nodigit_len(struct hist_entry *entry)
+{
+	const char *comm = comm__str(entry->comm);
+	size_t index, len_nodigit = 0;
+	bool in_number = false;
+
+	if (!comm)
+		return 0;
+
+	for (index = 0; comm[index]; index++) {
+		if (!isdigit((unsigned char)comm[index])) {
+			in_number = false;
+			len_nodigit++;
+		} else if (!in_number) {
+			in_number = true;
+			len_nodigit += 3; /* <N> */
+		}
+	}
+
+	return len_nodigit;
+}
+
+static int64_t strcmp_nodigit(const char *left, const char *right)
+{
+	for (;;) {
+		while (*left && isdigit((unsigned char)*left))
+			left++;
+		while (*right && isdigit((unsigned char)*right))
+			right++;
+		if (*left == *right && !*left) {
+			return 0;
+		} else if (*left == *right) {
+			left++;
+			right++;
+		} else {
+			return (int64_t)((unsigned char)*left - (unsigned char)*right);
+		}
+	}
+}
+
+static int64_t
+sort__comm_nodigit_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int64_t
+sort__comm_nodigit_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int64_t
+sort__comm_nodigit_sort(struct hist_entry *left, struct hist_entry *right)
+{
+	return strcmp_nodigit(comm__str(right->comm), comm__str(left->comm));
+}
+
+static int hist_entry__comm_nodigit_snprintf(struct hist_entry *he, char *bf,
+						size_t size, unsigned int width)
+{
+	int ret = 0;
+	unsigned int print_len, printed = 0, start = 0, end = 0;
+	bool in_digit;
+	const char *comm = comm__str(he->comm), *print;
+
+	while (printed < width && printed < size && comm[start]) {
+		in_digit = !!isdigit((unsigned char)comm[start]);
+		end = start + 1;
+		while (comm[end] && !!isdigit((unsigned char)comm[end]) == in_digit)
+			end++;
+		if (in_digit) {
+			print_len = 3; /* <N> */
+			print = "<N>";
+		} else {
+			print_len = end - start;
+			print = &comm[start];
+		}
+		print_len = min(print_len, width - printed);
+		ret = repsep_snprintf(bf + printed, size - printed, "%-.*s",
+					print_len, print);
+		if (ret < 0)
+			return ret;
+		start = end;
+		printed += ret;
+	}
+	/* Pad to width if necessary */
+	if (printed < width && printed < size) {
+		ret = repsep_snprintf(bf + printed, size - printed, "%-*.*s",
+				       width - printed, width - printed, "");
+		if (ret < 0)
+			return ret;
+		printed += ret;
+	}
+	return printed;
+}
+
+struct sort_entry sort_comm_nodigit = {
+	.se_header	= "CommandNoDigit",
+	.se_cmp		= sort__comm_nodigit_cmp,
+	.se_collapse	= sort__comm_nodigit_collapse,
+	.se_sort	= sort__comm_nodigit_sort,
+	.se_snprintf	= hist_entry__comm_nodigit_snprintf,
+	.se_filter	= hist_entry__thread_filter,
+	.se_width_idx	= HISTC_COMM_NODIGIT,
+};
+
 /* --sort dso */
 
 static int64_t _sort__dso_cmp(struct map *map_l, struct map *map_r)
@@ -2583,6 +2693,7 @@ static struct sort_dimension common_sort_dimensions[] = {
 	DIM(SORT_PID, "pid", sort_thread),
 	DIM(SORT_TGID, "tgid", sort_tgid),
 	DIM(SORT_COMM, "comm", sort_comm),
+	DIM(SORT_COMM_NODIGIT, "comm_nodigit", sort_comm_nodigit),
 	DIM(SORT_DSO, "dso", sort_dso),
 	DIM(SORT_SYM, "symbol", sort_sym),
 	DIM(SORT_PARENT, "parent", sort_parent),
@@ -3579,6 +3690,8 @@ static int __sort_dimension__update(struct sort_dimension *sd,
 		list->thread = 1;
 	} else if (sd->entry == &sort_comm) {
 		list->comm = 1;
+	} else if (sd->entry == &sort_comm_nodigit) {
+		list->comm_nodigit = list->comm = 1;
 	} else if (sd->entry == &sort_type_offset) {
 		symbol_conf.annotate_data_member = true;
 	} else if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) {
@@ -4040,6 +4153,7 @@ static bool get_elide(int idx, FILE *output)
 	case HISTC_DSO:
 		return __get_elide(symbol_conf.dso_list, "dso", output);
 	case HISTC_COMM:
+	case HISTC_COMM_NODIGIT:
 		return __get_elide(symbol_conf.comm_list, "comm", output);
 	default:
 		break;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index d7787958e06b..c962e77e4b93 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -43,6 +43,7 @@ enum sort_type {
 	/* common sort keys */
 	SORT_PID,
 	SORT_COMM,
+	SORT_COMM_NODIGIT,
 	SORT_DSO,
 	SORT_SYM,
 	SORT_PARENT,
@@ -158,4 +159,5 @@ sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right);
 int64_t
 _sort__sym_cmp(struct symbol *sym_l, struct symbol *sym_r);
 char *hist_entry__srcline(struct hist_entry *he);
+size_t sort__comm_nodigit_len(struct hist_entry *entry);
 #endif	/* __PERF_SORT_H */

From 74e2dbe7be5037a5e5eed6bc1ad562747ac88566 Mon Sep 17 00:00:00 2001
From: Qinxin Xia <xiaqinxin@huawei.com>
Date: Tue, 10 Mar 2026 12:06:07 +0800
Subject: [PATCH 057/131] perf tools: Add --pmu-filter option for filtering
 PMUs

Add a new --pmu-filter option to the perf stat command to allow
restricting events to specific PMUs. This is useful when there are
multiple PMUs of the same type (e.g. hisi_sicl2_cpa0 and hisi_sicl0_cpa0).

[root@localhost tmp]# perf stat -M cpa_p0_avg_bw
 Performance counter stats for 'system wide':

    19,417,779,115      hisi_sicl0_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
                 0      hisi_sicl0_cpa0/cpa_p0_wr_dat/
                 0      hisi_sicl0_cpa0/cpa_p0_rd_dat_64b/
                 0      hisi_sicl0_cpa0/cpa_p0_rd_dat_32b/
    19,417,751,103      hisi_sicl10_cpa0/cpa_cycles/     #     0.00 cpa_p0_avg_bw
                 0      hisi_sicl10_cpa0/cpa_p0_wr_dat/
                 0      hisi_sicl10_cpa0/cpa_p0_rd_dat_64b/
                 0      hisi_sicl10_cpa0/cpa_p0_rd_dat_32b/
    19,417,730,679      hisi_sicl2_cpa0/cpa_cycles/      #     0.31 cpa_p0_avg_bw
        75,635,749      hisi_sicl2_cpa0/cpa_p0_wr_dat/
        18,520,640      hisi_sicl2_cpa0/cpa_p0_rd_dat_64b/
                 0      hisi_sicl2_cpa0/cpa_p0_rd_dat_32b/
    19,417,674,227      hisi_sicl8_cpa0/cpa_cycles/      #     0.00 cpa_p0_avg_bw
                 0      hisi_sicl8_cpa0/cpa_p0_wr_dat/
                 0      hisi_sicl8_cpa0/cpa_p0_rd_dat_64b/
                 0      hisi_sicl8_cpa0/cpa_p0_rd_dat_32b/

      19.417734480 seconds time elapsed

[root@localhost tmp]# perf stat --pmu-filter hisi_sicl2_cpa0 -M cpa_p0_avg_bw
 Performance counter stats for 'system wide':

     6,234,093,559      cpa_cycles                       #     0.60 cpa_p0_avg_bw
        50,548,465      cpa_p0_wr_dat
         7,552,182      cpa_p0_rd_dat_64b
                 0      cpa_p0_rd_dat_32b

       6.234139320 seconds time elapsed

Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-stat.txt |  4 ++++
 tools/perf/builtin-stat.c              | 19 +++++++++++++++++++
 tools/perf/util/metricgroup.c          | 18 +++++++++++++-----
 tools/perf/util/parse-events.c         |  2 +-
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 7cccc3a847d1..b72a29c9223c 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -578,6 +578,10 @@ $ perf config stat.no-csv-summary=true
 Only enable events on applying cpu with this type for hybrid platform
 (e.g. core or atom)"
 
+--pmu-filter::
+Only enable events on the specified PMU. Useful when there are multiple
+PMUs of the same type (e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0).
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2eb76d7476b7..c043a31a2ab0 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1214,6 +1214,21 @@ static int parse_cputype(const struct option *opt,
 	return 0;
 }
 
+static int parse_pmu_filter(const struct option *opt,
+			   const char *str,
+			   int unset __maybe_unused)
+{
+	struct evlist *evlist = *(struct evlist **)opt->value;
+
+	if (!list_empty(&evlist->core.entries)) {
+		fprintf(stderr, "Must define pmu-filter before events/metrics\n");
+		return -1;
+	}
+
+	parse_events_option_args.pmu_filter = str;
+	return 0;
+}
+
 static int parse_cache_level(const struct option *opt,
 			     const char *str,
 			     int unset __maybe_unused)
@@ -2564,6 +2579,10 @@ int cmd_stat(int argc, const char **argv)
 			"Only enable events on applying cpu with this type "
 			"for hybrid platform (e.g. core or atom)",
 			parse_cputype),
+		OPT_CALLBACK(0, "pmu-filter", &evsel_list, "pmu",
+			"Only enable events on applying pmu with specified "
+			"for multiple pmus with same type(e.g. hisi_sicl2_cpa0 or hisi_sicl0_cpa0)",
+			parse_pmu_filter),
 #ifdef HAVE_LIBPFM
 		OPT_CALLBACK(0, "pfm-events", &evsel_list, "event",
 			"libpfm4 event selector. use 'perf list' to list available events",
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index 7e39d469111b..f7d53b4e46f4 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -387,8 +387,13 @@ static bool match_pm_metric_or_groups(const struct pmu_metric *pm, const char *p
 				      const char *metric_or_groups)
 {
 	const char *pm_pmu = pm->pmu ?: "cpu";
+	struct perf_pmu *perf_pmu = NULL;
 
-	if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu))
+	if (pm->pmu)
+		perf_pmu = perf_pmus__find(pm->pmu);
+
+	if (strcmp(pmu, "all") && strcmp(pm_pmu, pmu) &&
+	   (perf_pmu && !perf_pmu__name_wildcard_match(perf_pmu, pmu)))
 		return false;
 
 	return match_metric_or_groups(pm->metric_group, metric_or_groups) ||
@@ -1259,7 +1264,8 @@ err_out:
 static int parse_ids(bool metric_no_merge, bool fake_pmu,
 		     struct expr_parse_ctx *ids, const char *modifier,
 		     bool group_events, const bool tool_events[TOOL_PMU__EVENT_MAX],
-		     struct evlist **out_evlist)
+		     struct evlist **out_evlist,
+		     const char *filter_pmu)
 {
 	struct parse_events_error parse_error;
 	struct evlist *parsed_evlist;
@@ -1313,7 +1319,7 @@ static int parse_ids(bool metric_no_merge, bool fake_pmu,
 	}
 	pr_debug("Parsing metric events '%s'\n", events.buf);
 	parse_events_error__init(&parse_error);
-	ret = __parse_events(parsed_evlist, events.buf, /*pmu_filter=*/NULL,
+	ret = __parse_events(parsed_evlist, events.buf, filter_pmu,
 			     &parse_error, fake_pmu, /*warn_if_reordered=*/false,
 			     /*fake_tp=*/false);
 	if (ret) {
@@ -1416,7 +1422,8 @@ static int parse_groups(struct evlist *perf_evlist,
 					/*modifier=*/NULL,
 					/*group_events=*/false,
 					tool_events,
-					&combined_evlist);
+					&combined_evlist,
+					(pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
 		}
 		if (combined)
 			expr__ctx_free(combined);
@@ -1471,7 +1478,8 @@ static int parse_groups(struct evlist *perf_evlist,
 		}
 		if (!metric_evlist) {
 			ret = parse_ids(metric_no_merge, fake_pmu, m->pctx, m->modifier,
-					m->group_events, tool_events, &m->evlist);
+					m->group_events, tool_events, &m->evlist,
+					(pmu && strcmp(pmu, "all") == 0) ? NULL : pmu);
 			if (ret)
 				goto out;
 
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 7b4629625b1e..1497e1f2a08c 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -429,7 +429,7 @@ bool parse_events__filter_pmu(const struct parse_events_state *parse_state,
 	if (parse_state->pmu_filter == NULL)
 		return false;
 
-	return strcmp(parse_state->pmu_filter, pmu->name) != 0;
+	return perf_pmu__wildcard_match(pmu, parse_state->pmu_filter) == 0;
 }
 
 static int parse_events_add_pmu(struct parse_events_state *parse_state,

From be867c49fe62d56b5a4c2e08ce47dd396d13714f Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 6 Mar 2026 11:19:08 -0800
Subject: [PATCH 058/131] perf build: Add -funsigned-char to default CFLAGS

Commit 3bc753c06dd0 ("kbuild: treat char as always unsigned") made
chars unsigned by default in the Linux kernel. To avoid similar kinds
of bugs and warnings, make unsigned chars the default for the perf tool.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Makefile.config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 15fbba9f4ca8..333ddd0e4bd8 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -349,6 +349,7 @@ CORE_CFLAGS += -fno-omit-frame-pointer
 CORE_CFLAGS += -Wall
 CORE_CFLAGS += -Wextra
 CORE_CFLAGS += -std=gnu11
+CORE_CFLAGS += -funsigned-char
 
 CXXFLAGS += -std=gnu++17 -fno-exceptions -fno-rtti
 CXXFLAGS += -Wall

From a8e11416ffdcddb3bb3adb265f10b67591d21de8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 31 Mar 2026 18:06:33 -0300
Subject: [PATCH 059/131] perf beauty: Move tools/include/uapi/drm to
 tools/perf/trace/beauty/include/uapi

These headers are used only to parse ioctl numbers, not to build perf,
and so far no other tool living in tools/ uses them. To keep
tools/include/ restricted to what is needed for building tools, i.e. to
get access to things available in the kernel but not yet in the system
headers, move them to the directory that only the
tools/perf/trace/beauty/ scripts use to generate the tables used by perf.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Makefile.perf                                  | 2 +-
 tools/perf/check-headers.sh                               | 4 ++--
 tools/perf/trace/beauty/drm_ioctl.sh                      | 2 +-
 tools/{ => perf/trace/beauty}/include/uapi/drm/drm.h      | 0
 tools/{ => perf/trace/beauty}/include/uapi/drm/i915_drm.h | 0
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename tools/{ => perf/trace/beauty}/include/uapi/drm/drm.h (100%)
 rename tools/{ => perf/trace/beauty}/include/uapi/drm/i915_drm.h (100%)

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index f7b936deeaa2..a560fbc84793 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -547,7 +547,7 @@ $(clone_flags_array): $(beauty_uapi_linux_dir)/sched.h $(clone_flags_tbl)
 	$(Q)$(SHELL) '$(clone_flags_tbl)' $(beauty_uapi_linux_dir) > $@
 
 drm_ioctl_array := $(beauty_ioctl_outdir)/drm_ioctl_array.c
-drm_hdr_dir := $(srctree)/tools/include/uapi/drm
+drm_hdr_dir := $(srctree)/tools/perf/trace/beauty/include/uapi/drm
 drm_ioctl_tbl := $(srctree)/tools/perf/trace/beauty/drm_ioctl.sh
 
 $(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_tbl)
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 31826621eebd..c6b136fe8d13 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -6,8 +6,6 @@ NC='\033[0m' # No Color
 
 declare -a FILES=(
   "include/uapi/linux/const.h"
-  "include/uapi/drm/drm.h"
-  "include/uapi/drm/i915_drm.h"
   "include/uapi/linux/bits.h"
   "include/uapi/linux/fadvise.h"
   "include/uapi/linux/fscrypt.h"
@@ -90,6 +88,8 @@ declare -a SYNC_CHECK_FILES=(
 declare -a BEAUTY_FILES=(
   "arch/x86/include/asm/irq_vectors.h"
   "arch/x86/include/uapi/asm/prctl.h"
+  "include/uapi/drm/drm.h"
+  "include/uapi/drm/i915_drm.h"
   "include/linux/socket.h"
   "include/uapi/linux/fcntl.h"
   "include/uapi/linux/fs.h"
diff --git a/tools/perf/trace/beauty/drm_ioctl.sh b/tools/perf/trace/beauty/drm_ioctl.sh
index 9aa94fd523a9..f2f1a257bac8 100755
--- a/tools/perf/trace/beauty/drm_ioctl.sh
+++ b/tools/perf/trace/beauty/drm_ioctl.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # SPDX-License-Identifier: LGPL-2.1
 
-[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/drm/
+[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/perf/trace/beauty/include/uapi/drm/
 
 printf "#ifndef DRM_COMMAND_BASE\n"
 grep "#define DRM_COMMAND_BASE" $header_dir/drm.h
diff --git a/tools/include/uapi/drm/drm.h b/tools/perf/trace/beauty/include/uapi/drm/drm.h
similarity index 100%
rename from tools/include/uapi/drm/drm.h
rename to tools/perf/trace/beauty/include/uapi/drm/drm.h
diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/perf/trace/beauty/include/uapi/drm/i915_drm.h
similarity index 100%
rename from tools/include/uapi/drm/i915_drm.h
rename to tools/perf/trace/beauty/include/uapi/drm/i915_drm.h

From 7f8969aa739da4d2096f2e6f87e030de6efad9dc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 31 Mar 2026 18:06:34 -0300
Subject: [PATCH 060/131] perf beauty: Move copy of fadvise.h from
 tools/include/ to tools/perf/trace/beauty/include/

It is not really used when compiling anything; it is just parsed to
collect number->string tables for 'perf trace'.

  $ git grep fadvise.h tools/
  tools/perf/Makefile.perf:$(fadvise_advice_array): $(beauty_uapi_linux_dir)/fadvise.h $(fadvise_advice_tbl)
  tools/perf/check-headers.sh:  "include/uapi/linux/fadvise.h"
  tools/perf/trace/beauty/fadvise.sh:grep -E $regex ${header_dir}/fadvise.h | \
  tools/perf/trace/beauty/fadvise.sh:# tools/include/uapi/linux/fadvise.h for details.
  $

Link: https://lore.kernel.org/r/CAP-5=fVBNQVF8k3JUQjH1nkP69ZVp8BqP+uwygcx=xO0zC4xrg@mail.gmail.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Makefile.perf                                   | 4 ++--
 tools/perf/check-headers.sh                                | 2 +-
 tools/perf/trace/beauty/fadvise.sh                         | 2 +-
 tools/{ => perf/trace/beauty}/include/uapi/linux/fadvise.h | 0
 4 files changed, 4 insertions(+), 4 deletions(-)
 rename tools/{ => perf/trace/beauty}/include/uapi/linux/fadvise.h (100%)

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index a560fbc84793..cee19c923c06 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -556,8 +556,8 @@ $(drm_ioctl_array): $(drm_hdr_dir)/drm.h $(drm_hdr_dir)/i915_drm.h $(drm_ioctl_t
 fadvise_advice_array := $(beauty_outdir)/fadvise_advice_array.c
 fadvise_advice_tbl := $(srctree)/tools/perf/trace/beauty/fadvise.sh
 
-$(fadvise_advice_array): $(linux_uapi_dir)/in.h $(fadvise_advice_tbl)
-	$(Q)$(SHELL) '$(fadvise_advice_tbl)' $(linux_uapi_dir) > $@
+$(fadvise_advice_array): $(beauty_uapi_linux_dir)/fadvise.h $(fadvise_advice_tbl)
+	$(Q)$(SHELL) '$(fadvise_advice_tbl)' $(beauty_uapi_linux_dir) > $@
 
 fsmount_arrays := $(beauty_outdir)/fsmount_arrays.c
 fsmount_tbls := $(srctree)/tools/perf/trace/beauty/fsmount.sh
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index c6b136fe8d13..531c0e0e84df 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -7,7 +7,6 @@ NC='\033[0m' # No Color
 declare -a FILES=(
   "include/uapi/linux/const.h"
   "include/uapi/linux/bits.h"
-  "include/uapi/linux/fadvise.h"
   "include/uapi/linux/fscrypt.h"
   "include/uapi/linux/genetlink.h"
   "include/uapi/linux/if_addr.h"
@@ -91,6 +90,7 @@ declare -a BEAUTY_FILES=(
   "include/uapi/drm/drm.h"
   "include/uapi/drm/i915_drm.h"
   "include/linux/socket.h"
+  "include/uapi/linux/fadvise.h"
   "include/uapi/linux/fcntl.h"
   "include/uapi/linux/fs.h"
   "include/uapi/linux/mount.h"
diff --git a/tools/perf/trace/beauty/fadvise.sh b/tools/perf/trace/beauty/fadvise.sh
index 4d3dd6e56ded..e9857112fa51 100755
--- a/tools/perf/trace/beauty/fadvise.sh
+++ b/tools/perf/trace/beauty/fadvise.sh
@@ -1,7 +1,7 @@
 #!/bin/sh
 # SPDX-License-Identifier: LGPL-2.1
 
-[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/include/uapi/linux/
+[ $# -eq 1 ] && header_dir=$1 || header_dir=tools/perf/trace/beauty/include/uapi/linux/
 
 printf "static const char *fadvise_advices[] = {\n"
 regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+POSIX_FADV_(\w+)[[:space:]]+([[:digit:]]+)[[:space:]]+.*'
diff --git a/tools/include/uapi/linux/fadvise.h b/tools/perf/trace/beauty/include/uapi/linux/fadvise.h
similarity index 100%
rename from tools/include/uapi/linux/fadvise.h
rename to tools/perf/trace/beauty/include/uapi/linux/fadvise.h

From 83c338369a88eeab8cc64446c7ba9bb8ffb37e4a Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 31 Mar 2026 11:29:48 -0700
Subject: [PATCH 061/131] libperf cpumap: Make index and nr types unsigned

The index into the cpumap array and the number of entries within the
array can never be negative, so let's make them unsigned. This is
prompted by reports that gcc 13 with -O6 is giving
alloc-size-larger-than errors. This patch changes the cpumap types and
then updates the declarations of index variables throughout perf and
libperf to be unsigned. The two things are hard to separate, as
compiler warnings about mixing signed and unsigned types break the
build.

Reported-by: Chingbin Li <liqb365@163.com>
Closes: https://lore.kernel.org/lkml/20260212025127.841090-1-liqb365@163.com/
Tested-by: Chingbin Li <liqb365@163.com>
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/lib/perf/cpumap.c                       | 49 +++++++++----------
 tools/lib/perf/evsel.c                        | 10 ++--
 tools/lib/perf/include/internal/cpumap.h      |  6 +--
 tools/lib/perf/include/perf/cpumap.h          |  4 +-
 tools/perf/arch/arm/util/cs-etm.c             |  7 +--
 tools/perf/arch/arm64/util/arm-spe.c          |  3 +-
 tools/perf/arch/arm64/util/header.c           |  2 +-
 tools/perf/arch/x86/util/pmu.c                |  3 +-
 tools/perf/builtin-c2c.c                      |  6 +--
 tools/perf/builtin-record.c                   |  2 +-
 tools/perf/builtin-script.c                   |  5 +-
 tools/perf/builtin-stat.c                     |  2 +-
 tools/perf/tests/bitmap.c                     |  2 +-
 tools/perf/tests/cpumap.c                     |  6 ++-
 tools/perf/tests/mem2node.c                   |  2 +-
 tools/perf/tests/openat-syscall-all-cpus.c    |  3 +-
 tools/perf/tests/topology.c                   |  4 +-
 tools/perf/util/affinity.c                    |  2 +-
 tools/perf/util/bpf_counter.c                 | 24 ++++-----
 tools/perf/util/bpf_counter_cgroup.c          |  8 +--
 tools/perf/util/bpf_kwork.c                   |  3 +-
 tools/perf/util/bpf_kwork_top.c               |  3 +-
 tools/perf/util/bpf_off_cpu.c                 |  2 +-
 tools/perf/util/bpf_trace_augment.c           |  2 +-
 tools/perf/util/cpumap.c                      | 10 ++--
 tools/perf/util/cputopo.c                     |  2 +-
 tools/perf/util/env.c                         |  2 +-
 .../scripting-engines/trace-event-python.c    |  2 +-
 tools/perf/util/session.c                     |  3 +-
 tools/perf/util/stat-display.c                |  4 +-
 tools/perf/util/stat.c                        |  8 +--
 tools/perf/util/svghelper.c                   |  3 +-
 tools/perf/util/symbol.c                      |  3 +-
 tools/perf/util/synthetic-events.c            |  2 +-
 34 files changed, 108 insertions(+), 91 deletions(-)

diff --git a/tools/lib/perf/cpumap.c b/tools/lib/perf/cpumap.c
index 4160e7d2e120..e51b0490ad57 100644
--- a/tools/lib/perf/cpumap.c
+++ b/tools/lib/perf/cpumap.c
@@ -15,12 +15,12 @@
 
 #define MAX_NR_CPUS 4096
 
-void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus)
+void perf_cpu_map__set_nr(struct perf_cpu_map *map, unsigned int nr_cpus)
 {
 	RC_CHK_ACCESS(map)->nr = nr_cpus;
 }
 
-struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus)
+struct perf_cpu_map *perf_cpu_map__alloc(unsigned int nr_cpus)
 {
 	RC_STRUCT(perf_cpu_map) *cpus;
 	struct perf_cpu_map *result;
@@ -78,7 +78,7 @@ void perf_cpu_map__put(struct perf_cpu_map *map)
 static struct perf_cpu_map *cpu_map__new_sysconf(void)
 {
 	struct perf_cpu_map *cpus;
-	int nr_cpus, nr_cpus_conf;
+	long nr_cpus, nr_cpus_conf;
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	if (nr_cpus < 0)
@@ -86,15 +86,13 @@ static struct perf_cpu_map *cpu_map__new_sysconf(void)
 
 	nr_cpus_conf = sysconf(_SC_NPROCESSORS_CONF);
 	if (nr_cpus != nr_cpus_conf) {
-		pr_warning("Number of online CPUs (%d) differs from the number configured (%d) the CPU map will only cover the first %d CPUs.",
+		pr_warning("Number of online CPUs (%ld) differs from the number configured (%ld) the CPU map will only cover the first %ld CPUs.",
 			nr_cpus, nr_cpus_conf, nr_cpus);
 	}
 
 	cpus = perf_cpu_map__alloc(nr_cpus);
 	if (cpus != NULL) {
-		int i;
-
-		for (i = 0; i < nr_cpus; ++i)
+		for (long i = 0; i < nr_cpus; ++i)
 			RC_CHK_ACCESS(cpus)->map[i].cpu = i;
 	}
 
@@ -132,23 +130,23 @@ static int cmp_cpu(const void *a, const void *b)
 	return cpu_a->cpu - cpu_b->cpu;
 }
 
-static struct perf_cpu __perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
+static struct perf_cpu __perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx)
 {
 	return RC_CHK_ACCESS(cpus)->map[idx];
 }
 
-static struct perf_cpu_map *cpu_map__trim_new(int nr_cpus, const struct perf_cpu *tmp_cpus)
+static struct perf_cpu_map *cpu_map__trim_new(unsigned int nr_cpus, const struct perf_cpu *tmp_cpus)
 {
 	size_t payload_size = nr_cpus * sizeof(struct perf_cpu);
 	struct perf_cpu_map *cpus = perf_cpu_map__alloc(nr_cpus);
-	int i, j;
 
 	if (cpus != NULL) {
+		unsigned int j = 0;
+
 		memcpy(RC_CHK_ACCESS(cpus)->map, tmp_cpus, payload_size);
 		qsort(RC_CHK_ACCESS(cpus)->map, nr_cpus, sizeof(struct perf_cpu), cmp_cpu);
 		/* Remove dups */
-		j = 0;
-		for (i = 0; i < nr_cpus; i++) {
+		for (unsigned int i = 0; i < nr_cpus; i++) {
 			if (i == 0 ||
 			    __perf_cpu_map__cpu(cpus, i).cpu !=
 			    __perf_cpu_map__cpu(cpus, i - 1).cpu) {
@@ -167,9 +165,8 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
 	struct perf_cpu_map *cpus = NULL;
 	unsigned long start_cpu, end_cpu = 0;
 	char *p = NULL;
-	int i, nr_cpus = 0;
+	unsigned int nr_cpus = 0, max_entries = 0;
 	struct perf_cpu *tmp_cpus = NULL, *tmp;
-	int max_entries = 0;
 
 	if (!cpu_list)
 		return perf_cpu_map__new_online_cpus();
@@ -208,9 +205,10 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
 
 		for (; start_cpu <= end_cpu; start_cpu++) {
 			/* check for duplicates */
-			for (i = 0; i < nr_cpus; i++)
+			for (unsigned int i = 0; i < nr_cpus; i++) {
 				if (tmp_cpus[i].cpu == (int16_t)start_cpu)
 					goto invalid;
+			}
 
 			if (nr_cpus == max_entries) {
 				max_entries += max(end_cpu - start_cpu + 1, 16UL);
@@ -252,12 +250,12 @@ struct perf_cpu_map *perf_cpu_map__new_int(int cpu)
 	return cpus;
 }
 
-static int __perf_cpu_map__nr(const struct perf_cpu_map *cpus)
+static unsigned int __perf_cpu_map__nr(const struct perf_cpu_map *cpus)
 {
 	return RC_CHK_ACCESS(cpus)->nr;
 }
 
-struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
+struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx)
 {
 	struct perf_cpu result = {
 		.cpu = -1
@@ -269,7 +267,7 @@ struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx)
 	return result;
 }
 
-int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
+unsigned int perf_cpu_map__nr(const struct perf_cpu_map *cpus)
 {
 	return cpus ? __perf_cpu_map__nr(cpus) : 1;
 }
@@ -294,7 +292,7 @@ bool perf_cpu_map__is_empty(const struct perf_cpu_map *map)
 
 int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu)
 {
-	int low, high;
+	unsigned int low, high;
 
 	if (!cpus)
 		return -1;
@@ -324,7 +322,7 @@ bool perf_cpu_map__has(const struct perf_cpu_map *cpus, struct perf_cpu cpu)
 
 bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_map *rhs)
 {
-	int nr;
+	unsigned int nr;
 
 	if (lhs == rhs)
 		return true;
@@ -336,7 +334,7 @@ bool perf_cpu_map__equal(const struct perf_cpu_map *lhs, const struct perf_cpu_m
 	if (nr != __perf_cpu_map__nr(rhs))
 		return false;
 
-	for (int idx = 0; idx < nr; idx++) {
+	for (unsigned int idx = 0; idx < nr; idx++) {
 		if (__perf_cpu_map__cpu(lhs, idx).cpu != __perf_cpu_map__cpu(rhs, idx).cpu)
 			return false;
 	}
@@ -353,7 +351,7 @@ struct perf_cpu perf_cpu_map__min(const struct perf_cpu_map *map)
 	struct perf_cpu cpu, result = {
 		.cpu = -1
 	};
-	int idx;
+	unsigned int idx;
 
 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, map) {
 		result = cpu;
@@ -384,7 +382,7 @@ bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu
 	if (!a || __perf_cpu_map__nr(b) > __perf_cpu_map__nr(a))
 		return false;
 
-	for (int i = 0, j = 0; i < __perf_cpu_map__nr(a); i++) {
+	for (unsigned int i = 0, j = 0; i < __perf_cpu_map__nr(a); i++) {
 		if (__perf_cpu_map__cpu(a, i).cpu > __perf_cpu_map__cpu(b, j).cpu)
 			return false;
 		if (__perf_cpu_map__cpu(a, i).cpu == __perf_cpu_map__cpu(b, j).cpu) {
@@ -410,8 +408,7 @@ bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu
 int perf_cpu_map__merge(struct perf_cpu_map **orig, struct perf_cpu_map *other)
 {
 	struct perf_cpu *tmp_cpus;
-	int tmp_len;
-	int i, j, k;
+	unsigned int tmp_len, i, j, k;
 	struct perf_cpu_map *merged;
 
 	if (perf_cpu_map__is_subset(*orig, other))
@@ -455,7 +452,7 @@ int perf_cpu_map__merge(struct perf_cpu_map **orig, struct perf_cpu_map *other)
 struct perf_cpu_map *perf_cpu_map__intersect(struct perf_cpu_map *orig,
 					     struct perf_cpu_map *other)
 {
-	int i, j, k;
+	unsigned int i, j, k;
 	struct perf_cpu_map *merged;
 
 	if (perf_cpu_map__is_subset(other, orig))
diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c
index 13a307fc75ae..f747c0bc692d 100644
--- a/tools/lib/perf/evsel.c
+++ b/tools/lib/perf/evsel.c
@@ -127,7 +127,8 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus,
 		     struct perf_thread_map *threads)
 {
 	struct perf_cpu cpu;
-	int idx, thread, err = 0;
+	unsigned int idx;
+	int thread, err = 0;
 
 	if (cpus == NULL) {
 		static struct perf_cpu_map *empty_cpu_map;
@@ -460,7 +461,7 @@ int perf_evsel__enable_cpu(struct perf_evsel *evsel, int cpu_map_idx)
 int perf_evsel__enable_thread(struct perf_evsel *evsel, int thread)
 {
 	struct perf_cpu cpu __maybe_unused;
-	int idx;
+	unsigned int idx;
 	int err;
 
 	perf_cpu_map__for_each_cpu(cpu, idx, evsel->cpus) {
@@ -499,12 +500,13 @@ int perf_evsel__disable(struct perf_evsel *evsel)
 
 int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter)
 {
-	int err = 0, i;
+	int err = 0;
 
-	for (i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++)
+	for (unsigned int i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++) {
 		err = perf_evsel__run_ioctl(evsel,
 				     PERF_EVENT_IOC_SET_FILTER,
 				     (void *)filter, i);
+	}
 	return err;
 }
 
diff --git a/tools/lib/perf/include/internal/cpumap.h b/tools/lib/perf/include/internal/cpumap.h
index e2be2d17c32b..c19678188b17 100644
--- a/tools/lib/perf/include/internal/cpumap.h
+++ b/tools/lib/perf/include/internal/cpumap.h
@@ -16,16 +16,16 @@
 DECLARE_RC_STRUCT(perf_cpu_map) {
 	refcount_t	refcnt;
 	/** Length of the map array. */
-	int		nr;
+	unsigned int	nr;
 	/** The CPU values. */
 	struct perf_cpu	map[];
 };
 
-struct perf_cpu_map *perf_cpu_map__alloc(int nr_cpus);
+struct perf_cpu_map *perf_cpu_map__alloc(unsigned int nr_cpus);
 int perf_cpu_map__idx(const struct perf_cpu_map *cpus, struct perf_cpu cpu);
 bool perf_cpu_map__is_subset(const struct perf_cpu_map *a, const struct perf_cpu_map *b);
 
-void perf_cpu_map__set_nr(struct perf_cpu_map *map, int nr_cpus);
+void perf_cpu_map__set_nr(struct perf_cpu_map *map, unsigned int nr_cpus);
 
 static inline refcount_t *perf_cpu_map__refcnt(struct perf_cpu_map *map)
 {
diff --git a/tools/lib/perf/include/perf/cpumap.h b/tools/lib/perf/include/perf/cpumap.h
index 58cc5c5fa47c..a1dd25db65b6 100644
--- a/tools/lib/perf/include/perf/cpumap.h
+++ b/tools/lib/perf/include/perf/cpumap.h
@@ -49,7 +49,7 @@ LIBPERF_API void perf_cpu_map__put(struct perf_cpu_map *map);
  * perf_cpu_map__cpu - get the CPU value at the given index. Returns -1 if index
  *                     is invalid.
  */
-LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, int idx);
+LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, unsigned int idx);
 /**
  * perf_cpu_map__nr - for an empty map returns 1, as perf_cpu_map__cpu returns a
  *                    cpu of -1 for an invalid index, this makes an empty map
@@ -57,7 +57,7 @@ LIBPERF_API struct perf_cpu perf_cpu_map__cpu(const struct perf_cpu_map *cpus, i
  *                    the result is the number CPUs in the map plus one if the
  *                    "any CPU"/dummy value is present.
  */
-LIBPERF_API int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
+LIBPERF_API unsigned int perf_cpu_map__nr(const struct perf_cpu_map *cpus);
 /**
  * perf_cpu_map__has_any_cpu_or_is_empty - is map either empty or has the "any CPU"/dummy value.
  */
diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c
index 4418d21708d6..b7a839de8707 100644
--- a/tools/perf/arch/arm/util/cs-etm.c
+++ b/tools/perf/arch/arm/util/cs-etm.c
@@ -197,7 +197,8 @@ static struct perf_pmu *cs_etm_get_pmu(struct auxtrace_record *itr)
 static int cs_etm_validate_config(struct perf_pmu *cs_etm_pmu,
 				  struct evsel *evsel)
 {
-	int idx, err = 0;
+	unsigned int idx;
+	int err = 0;
 	struct perf_cpu_map *event_cpus = evsel->evlist->core.user_requested_cpus;
 	struct perf_cpu_map *intersect_cpus;
 	struct perf_cpu cpu;
@@ -546,7 +547,7 @@ static size_t
 cs_etm_info_priv_size(struct auxtrace_record *itr,
 		      struct evlist *evlist)
 {
-	int idx;
+	unsigned int idx;
 	int etmv3 = 0, etmv4 = 0, ete = 0;
 	struct perf_cpu_map *event_cpus = evlist->core.user_requested_cpus;
 	struct perf_cpu_map *intersect_cpus;
@@ -783,7 +784,7 @@ static int cs_etm_info_fill(struct auxtrace_record *itr,
 			    struct perf_record_auxtrace_info *info,
 			    size_t priv_size)
 {
-	int i;
+	unsigned int i;
 	u32 offset;
 	u64 nr_cpu, type;
 	struct perf_cpu_map *cpu_map;
diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c
index 17ced7bbbdda..f00d72d087fc 100644
--- a/tools/perf/arch/arm64/util/arm-spe.c
+++ b/tools/perf/arch/arm64/util/arm-spe.c
@@ -144,7 +144,8 @@ static int arm_spe_info_fill(struct auxtrace_record *itr,
 			     struct perf_record_auxtrace_info *auxtrace_info,
 			     size_t priv_size)
 {
-	int i, ret;
+	unsigned int i;
+	int ret;
 	size_t offset;
 	struct arm_spe_recording *sper =
 			container_of(itr, struct arm_spe_recording, itr);
diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c
index cbc0ba101636..95e71c4f6c78 100644
--- a/tools/perf/arch/arm64/util/header.c
+++ b/tools/perf/arch/arm64/util/header.c
@@ -43,7 +43,7 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu cpu)
 int get_cpuid(char *buf, size_t sz, struct perf_cpu cpu)
 {
 	struct perf_cpu_map *cpus;
-	int idx;
+	unsigned int idx;
 
 	if (cpu.cpu != -1)
 		return _get_cpuid(buf, sz, cpu);
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index 4ea4d022c9c3..0661e0f0b02d 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -221,7 +221,8 @@ static void gnr_uncore_cha_imc_adjust_cpumask_for_snc(struct perf_pmu *pmu, bool
 	static struct perf_cpu_map *cha_adjusted[MAX_SNCS];
 	static struct perf_cpu_map *imc_adjusted[MAX_SNCS];
 	struct perf_cpu_map **adjusted = cha ? cha_adjusted : imc_adjusted;
-	int idx, pmu_snc, cpu_adjust;
+	unsigned int idx;
+	int pmu_snc, cpu_adjust;
 	struct perf_cpu cpu;
 	bool alloc;
 
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index d390ae4e3ec8..e60eea62c2fc 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2310,7 +2310,6 @@ static int setup_nodes(struct perf_session *session)
 {
 	struct numa_node *n;
 	unsigned long **nodes;
-	int node, idx;
 	struct perf_cpu cpu;
 	int *cpu2node;
 	struct perf_env *env = perf_session__env(session);
@@ -2335,14 +2334,15 @@ static int setup_nodes(struct perf_session *session)
 	if (!cpu2node)
 		return -ENOMEM;
 
-	for (idx = 0; idx < c2c.cpus_cnt; idx++)
+	for (int idx = 0; idx < c2c.cpus_cnt; idx++)
 		cpu2node[idx] = -1;
 
 	c2c.cpu2node = cpu2node;
 
-	for (node = 0; node < c2c.nodes_cnt; node++) {
+	for (int node = 0; node < c2c.nodes_cnt; node++) {
 		struct perf_cpu_map *map = n[node].map;
 		unsigned long *set;
+		unsigned int idx;
 
 		set = bitmap_zalloc(c2c.cpus_cnt);
 		if (!set)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3276ffdc3141..e919d1f021c3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -3663,7 +3663,7 @@ struct option *record_options = __record_options;
 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
 {
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	if (cpu_map__is_dummy(cpus))
 		return 0;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b80c406d1fc1..b005b23f9d8c 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2572,7 +2572,6 @@ static struct scripting_ops	*scripting_ops;
 static void __process_stat(struct evsel *counter, u64 tstamp)
 {
 	int nthreads = perf_thread_map__nr(counter->core.threads);
-	int idx, thread;
 	struct perf_cpu cpu;
 	static int header_printed;
 
@@ -2582,7 +2581,9 @@ static void __process_stat(struct evsel *counter, u64 tstamp)
 		header_printed = 1;
 	}
 
-	for (thread = 0; thread < nthreads; thread++) {
+	for (int thread = 0; thread < nthreads; thread++) {
+		unsigned int idx;
+
 		perf_cpu_map__for_each_cpu(cpu, idx, evsel__cpus(counter)) {
 			struct perf_counts_values *counts;
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c043a31a2ab0..a24326c44297 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -410,7 +410,7 @@ static int read_tool_counters(void)
 	struct evsel *counter;
 
 	evlist__for_each_entry(evsel_list, counter) {
-		int idx;
+		unsigned int idx;
 
 		if (!evsel__is_tool(counter))
 			continue;
diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c
index 98956e0e0765..e7adf60be721 100644
--- a/tools/perf/tests/bitmap.c
+++ b/tools/perf/tests/bitmap.c
@@ -16,7 +16,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 	bm = bitmap_zalloc(nbits);
 
 	if (map && bm) {
-		int i;
+		unsigned int i;
 		struct perf_cpu cpu;
 
 		perf_cpu_map__for_each_cpu(cpu, i, map)
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index 2354246afc5a..b051dce2cd86 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -156,7 +156,8 @@ static int test__cpu_map_print(struct test_suite *test __maybe_unused, int subte
 	return 0;
 }
 
-static int __test__cpu_map_merge(const char *lhs, const char *rhs, int nr, const char *expected)
+static int __test__cpu_map_merge(const char *lhs, const char *rhs, unsigned int nr,
+				 const char *expected)
 {
 	struct perf_cpu_map *a = perf_cpu_map__new(lhs);
 	struct perf_cpu_map *b = perf_cpu_map__new(rhs);
@@ -204,7 +205,8 @@ static int test__cpu_map_merge(struct test_suite *test __maybe_unused,
 	return ret;
 }
 
-static int __test__cpu_map_intersect(const char *lhs, const char *rhs, int nr, const char *expected)
+static int __test__cpu_map_intersect(const char *lhs, const char *rhs, unsigned int nr,
+				     const char *expected)
 {
 	struct perf_cpu_map *a = perf_cpu_map__new(lhs);
 	struct perf_cpu_map *b = perf_cpu_map__new(rhs);
diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c
index a0e88c496107..7ce1ad7b6ce5 100644
--- a/tools/perf/tests/mem2node.c
+++ b/tools/perf/tests/mem2node.c
@@ -30,7 +30,7 @@ static unsigned long *get_bitmap(const char *str, int nbits)
 
 	if (map && bm) {
 		struct perf_cpu cpu;
-		int i;
+		unsigned int i;
 
 		perf_cpu_map__for_each_cpu(cpu, i, map)
 			__set_bit(cpu.cpu, bm);
diff --git a/tools/perf/tests/openat-syscall-all-cpus.c b/tools/perf/tests/openat-syscall-all-cpus.c
index 3644d6f52c07..0be43f8db3bd 100644
--- a/tools/perf/tests/openat-syscall-all-cpus.c
+++ b/tools/perf/tests/openat-syscall-all-cpus.c
@@ -22,7 +22,8 @@
 static int test__openat_syscall_event_on_all_cpus(struct test_suite *test __maybe_unused,
 						  int subtest __maybe_unused)
 {
-	int err = TEST_FAIL, fd, idx;
+	int err = TEST_FAIL, fd;
+	unsigned int idx;
 	struct perf_cpu cpu;
 	struct perf_cpu_map *cpus;
 	struct evsel *evsel;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index a34a7ab19a80..75b748ddf824 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -69,7 +69,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 		.path = path,
 		.mode = PERF_DATA_MODE_READ,
 	};
-	int i;
+	unsigned int i;
 	struct aggr_cpu_id id;
 	struct perf_cpu cpu;
 	struct perf_env *env;
@@ -116,7 +116,7 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map)
 
 	TEST_ASSERT_VAL("Session header CPU map not set", env->cpu);
 
-	for (i = 0; i < env->nr_cpus_avail; i++) {
+	for (i = 0; i < (unsigned int)env->nr_cpus_avail; i++) {
 		cpu.cpu = i;
 		if (!perf_cpu_map__has(map, cpu))
 			continue;
diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c
index 4fe851334296..6c64b5f69a4e 100644
--- a/tools/perf/util/affinity.c
+++ b/tools/perf/util/affinity.c
@@ -90,7 +90,7 @@ void cpu_map__set_affinity(const struct perf_cpu_map *cpumap)
 	int cpu_set_size = get_cpu_set_size();
 	unsigned long *cpuset = bitmap_zalloc(cpu_set_size * 8);
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	if (!cpuset)
 		return;
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index a5882b582205..2ffd7aefb6eb 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -294,7 +294,8 @@ static int bpf_program_profiler__read(struct evsel *evsel)
 	struct perf_counts_values *counts;
 	int reading_map_fd;
 	__u32 key = 0;
-	int err, idx, bpf_cpu;
+	int err, bpf_cpu;
+	unsigned int idx;
 
 	if (list_empty(&evsel->bpf_counter_list))
 		return -EAGAIN;
@@ -318,11 +319,12 @@ static int bpf_program_profiler__read(struct evsel *evsel)
 		}
 
 		for (bpf_cpu = 0; bpf_cpu < num_cpu_bpf; bpf_cpu++) {
-			idx = perf_cpu_map__idx(evsel__cpus(evsel),
-						(struct perf_cpu){.cpu = bpf_cpu});
-			if (idx == -1)
+			int i = perf_cpu_map__idx(evsel__cpus(evsel),
+						  (struct perf_cpu){.cpu = bpf_cpu});
+
+			if (i == -1)
 				continue;
-			counts = perf_counts(evsel->counts, idx, 0);
+			counts = perf_counts(evsel->counts, i, 0);
 			counts->val += values[bpf_cpu].counter;
 			counts->ena += values[bpf_cpu].enabled;
 			counts->run += values[bpf_cpu].running;
@@ -668,7 +670,7 @@ static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
 static int bperf_sync_counters(struct evsel *evsel)
 {
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus)
 		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu.cpu);
@@ -695,13 +697,11 @@ static int bperf__read(struct evsel *evsel)
 	struct bpf_perf_event_value values[num_cpu_bpf];
 	struct perf_counts_values *counts;
 	int reading_map_fd, err = 0;
-	__u32 i;
-	int j;
 
 	bperf_sync_counters(evsel);
 	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);
 
-	for (i = 0; i < filter_entry_cnt; i++) {
+	for (__u32 i = 0; i < filter_entry_cnt; i++) {
 		struct perf_cpu entry;
 		__u32 cpu;
 
@@ -709,9 +709,10 @@ static int bperf__read(struct evsel *evsel)
 		if (err)
 			goto out;
 		switch (evsel->follower_skel->bss->type) {
-		case BPERF_FILTER_GLOBAL:
-			assert(i == 0);
+		case BPERF_FILTER_GLOBAL: {
+			unsigned int j;
 
+			assert(i == 0);
 			perf_cpu_map__for_each_cpu(entry, j, evsel__cpus(evsel)) {
 				counts = perf_counts(evsel->counts, j, 0);
 				counts->val = values[entry.cpu].counter;
@@ -719,6 +720,7 @@ static int bperf__read(struct evsel *evsel)
 				counts->run = values[entry.cpu].running;
 			}
 			break;
+		}
 		case BPERF_FILTER_CPU:
 			cpu = perf_cpu_map__cpu(evsel__cpus(evsel), i).cpu;
 			assert(cpu >= 0);
diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
index 17d7196c6589..5572ceccf860 100644
--- a/tools/perf/util/bpf_counter_cgroup.c
+++ b/tools/perf/util/bpf_counter_cgroup.c
@@ -98,7 +98,7 @@ static int bperf_load_program(struct evlist *evlist)
 	struct bpf_link *link;
 	struct evsel *evsel;
 	struct cgroup *cgrp, *leader_cgrp;
-	int i, j;
+	unsigned int i;
 	struct perf_cpu cpu;
 	int total_cpus = cpu__max_cpu().cpu;
 	int map_fd, prog_fd, err;
@@ -146,6 +146,8 @@ static int bperf_load_program(struct evlist *evlist)
 
 	evlist__for_each_entry(evlist, evsel) {
 		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
+			unsigned int j;
+
 			leader_cgrp = evsel->cgrp;
 			evsel->cgrp = NULL;
 
@@ -234,7 +236,7 @@ static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
 static int bperf_cgrp__sync_counters(struct evlist *evlist)
 {
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 	int prog_fd = bpf_program__fd(skel->progs.trigger_read);
 
 	perf_cpu_map__for_each_cpu(cpu, idx, evlist->core.all_cpus)
@@ -286,7 +288,7 @@ static int bperf_cgrp__read(struct evsel *evsel)
 
 	evlist__for_each_entry(evlist, evsel) {
 		__u32 idx = evsel->core.idx;
-		int i;
+		unsigned int i;
 		struct perf_cpu cpu;
 
 		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
diff --git a/tools/perf/util/bpf_kwork.c b/tools/perf/util/bpf_kwork.c
index 5cff755c71fa..d3a2e548f2b6 100644
--- a/tools/perf/util/bpf_kwork.c
+++ b/tools/perf/util/bpf_kwork.c
@@ -148,7 +148,8 @@ static bool valid_kwork_class_type(enum kwork_class_type type)
 static int setup_filters(struct perf_kwork *kwork)
 {
 	if (kwork->cpu_list != NULL) {
-		int idx, nr_cpus;
+		unsigned int idx;
+		int nr_cpus;
 		struct perf_cpu_map *map;
 		struct perf_cpu cpu;
 		int fd = bpf_map__fd(skel->maps.perf_kwork_cpu_filter);
diff --git a/tools/perf/util/bpf_kwork_top.c b/tools/perf/util/bpf_kwork_top.c
index b6f187dd9136..189a29d2bc96 100644
--- a/tools/perf/util/bpf_kwork_top.c
+++ b/tools/perf/util/bpf_kwork_top.c
@@ -123,7 +123,8 @@ static bool valid_kwork_class_type(enum kwork_class_type type)
 static int setup_filters(struct perf_kwork *kwork)
 {
 	if (kwork->cpu_list) {
-		int idx, nr_cpus, fd;
+		unsigned int idx;
+		int nr_cpus, fd;
 		struct perf_cpu_map *map;
 		struct perf_cpu cpu;
 
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index 88e0660c4bff..0891d9c73660 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -67,7 +67,7 @@ static void off_cpu_start(void *arg)
 	struct evlist *evlist = arg;
 	struct evsel *evsel;
 	struct perf_cpu pcpu;
-	int i;
+	unsigned int i;
 
 	/* update task filter for the given workload */
 	if (skel->rodata->has_task && skel->rodata->uses_tgid &&
diff --git a/tools/perf/util/bpf_trace_augment.c b/tools/perf/util/bpf_trace_augment.c
index 56ed17534caa..9e706f0fa53d 100644
--- a/tools/perf/util/bpf_trace_augment.c
+++ b/tools/perf/util/bpf_trace_augment.c
@@ -60,7 +60,7 @@ int augmented_syscalls__create_bpf_output(struct evlist *evlist)
 void augmented_syscalls__setup_bpf_output(void)
 {
 	struct perf_cpu cpu;
-	int i;
+	unsigned int i;
 
 	if (bpf_output == NULL)
 		return;
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index a80845038a5e..11922e1ded84 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -254,7 +254,7 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 				       aggr_cpu_id_get_t get_id,
 				       void *data, bool needs_sort)
 {
-	int idx;
+	unsigned int idx;
 	struct perf_cpu cpu;
 	struct cpu_aggr_map *c = cpu_aggr_map__empty_new(perf_cpu_map__nr(cpus));
 
@@ -280,7 +280,7 @@ struct cpu_aggr_map *cpu_aggr_map__new(const struct perf_cpu_map *cpus,
 		}
 	}
 	/* Trim. */
-	if (c->nr != perf_cpu_map__nr(cpus)) {
+	if (c->nr != (int)perf_cpu_map__nr(cpus)) {
 		struct cpu_aggr_map *trimmed_c =
 			realloc(c,
 				sizeof(struct cpu_aggr_map) + sizeof(struct aggr_cpu_id) * c->nr);
@@ -631,9 +631,9 @@ size_t cpu_map__snprint(struct perf_cpu_map *map, char *buf, size_t size)
 
 #define COMMA first ? "" : ","
 
-	for (i = 0; i < perf_cpu_map__nr(map) + 1; i++) {
+	for (i = 0; i < (int)perf_cpu_map__nr(map) + 1; i++) {
 		struct perf_cpu cpu = { .cpu = INT16_MAX };
-		bool last = i == perf_cpu_map__nr(map);
+		bool last = i == (int)perf_cpu_map__nr(map);
 
 		if (!last)
 			cpu = perf_cpu_map__cpu(map, i);
@@ -679,7 +679,7 @@ static char hex_char(unsigned char val)
 
 size_t cpu_map__snprint_mask(struct perf_cpu_map *map, char *buf, size_t size)
 {
-	int idx;
+	unsigned int idx;
 	char *ptr = buf;
 	unsigned char *bitmap;
 	struct perf_cpu c, last_cpu = perf_cpu_map__max(map);
diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c
index 8bbeb2dc76fd..e0091804fe98 100644
--- a/tools/perf/util/cputopo.c
+++ b/tools/perf/util/cputopo.c
@@ -191,7 +191,7 @@ bool cpu_topology__core_wide(const struct cpu_topology *topology,
 		const char *core_cpu_list = topology->core_cpus_list[i];
 		struct perf_cpu_map *core_cpus = perf_cpu_map__new(core_cpu_list);
 		struct perf_cpu cpu;
-		int idx;
+		unsigned int idx;
 		bool has_first, first = true;
 
 		perf_cpu_map__for_each_cpu(cpu, idx, core_cpus) {
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index 93d475a80f14..1e54e2c86360 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -718,7 +718,7 @@ int perf_env__numa_node(struct perf_env *env, struct perf_cpu cpu)
 
 		for (i = 0; i < env->nr_numa_nodes; i++) {
 			struct perf_cpu tmp;
-			int j;
+			unsigned int j;
 
 			nn = &env->numa_nodes[i];
 			perf_cpu_map__for_each_cpu(tmp, j, nn->map)
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 2b0df7bd9a46..5a30caaec73e 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -1701,7 +1701,7 @@ static void python_process_stat(struct perf_stat_config *config,
 	struct perf_cpu_map *cpus = counter->core.cpus;
 
 	for (int thread = 0; thread < perf_thread_map__nr(threads); thread++) {
-		int idx;
+		unsigned int idx;
 		struct perf_cpu cpu;
 
 		perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4b465abfa36c..09de5288f9e1 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2766,7 +2766,8 @@ struct evsel *perf_session__find_first_evtype(struct perf_session *session,
 int perf_session__cpu_bitmap(struct perf_session *session,
 			     const char *cpu_list, unsigned long *cpu_bitmap)
 {
-	int i, err = -1;
+	unsigned int i;
+	int err = -1;
 	struct perf_cpu_map *map;
 	int nr_cpus = min(perf_session__env(session)->nr_cpus_avail, MAX_NR_CPUS);
 	struct perf_cpu cpu;
diff --git a/tools/perf/util/stat-display.c b/tools/perf/util/stat-display.c
index dc2b66855f6c..993f4c4b8f44 100644
--- a/tools/perf/util/stat-display.c
+++ b/tools/perf/util/stat-display.c
@@ -897,7 +897,7 @@ static bool should_skip_zero_counter(struct perf_stat_config *config,
 				     const struct aggr_cpu_id *id)
 {
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	/*
 	 * Skip unsupported default events when not verbose. (default events
@@ -1125,7 +1125,7 @@ static void print_no_aggr_metric(struct perf_stat_config *config,
 				 struct evlist *evlist,
 				 struct outstate *os)
 {
-	int all_idx;
+	unsigned int all_idx;
 	struct perf_cpu cpu;
 
 	perf_cpu_map__for_each_cpu(cpu, all_idx, evlist->core.user_requested_cpus) {
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index 976a06e63252..14d169e22e8f 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -246,9 +246,11 @@ void evlist__reset_prev_raw_counts(struct evlist *evlist)
 
 static void evsel__copy_prev_raw_counts(struct evsel *evsel)
 {
-	int idx, nthreads = perf_thread_map__nr(evsel->core.threads);
+	int nthreads = perf_thread_map__nr(evsel->core.threads);
 
 	for (int thread = 0; thread < nthreads; thread++) {
+		unsigned int idx;
+
 		perf_cpu_map__for_each_idx(idx, evsel__cpus(evsel)) {
 			*perf_counts(evsel->counts, idx, thread) =
 				*perf_counts(evsel->prev_raw_counts, idx, thread);
@@ -580,7 +582,7 @@ static void evsel__update_percore_stats(struct evsel *evsel, struct aggr_cpu_id
 	struct perf_counts_values counts = { 0, };
 	struct aggr_cpu_id id;
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	/* collect per-core counts */
 	perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) {
@@ -617,7 +619,7 @@ static void evsel__process_percore(struct evsel *evsel)
 	struct perf_stat_evsel *ps = evsel->stats;
 	struct aggr_cpu_id core_id;
 	struct perf_cpu cpu;
-	int idx;
+	unsigned int idx;
 
 	if (!evsel->percore)
 		return;
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index b1d259f590e9..e360e7736c7b 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -726,7 +726,8 @@ static void scan_core_topology(int *map, struct topology *t, int nr_cpus)
 
 static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus)
 {
-	int idx, ret = 0;
+	unsigned int idx;
+	int ret = 0;
 	struct perf_cpu_map *map;
 	struct perf_cpu cpu;
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index ce9195717f44..b4b30675688d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -2363,7 +2363,8 @@ static int setup_parallelism_bitmap(void)
 {
 	struct perf_cpu_map *map;
 	struct perf_cpu cpu;
-	int i, err = -1;
+	unsigned int i;
+	int err = -1;
 
 	if (symbol_conf.parallelism_list_str == NULL)
 		return 0;
diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c
index ddf1cbda1902..85bee747f4cd 100644
--- a/tools/perf/util/synthetic-events.c
+++ b/tools/perf/util/synthetic-events.c
@@ -1266,7 +1266,7 @@ static void synthesize_cpus(struct synthesize_cpu_map_data *data)
 
 static void synthesize_mask(struct synthesize_cpu_map_data *data)
 {
-	int idx;
+	unsigned int idx;
 	struct perf_cpu cpu;
 
 	/* Due to padding, the 4bytes per entry mask variant is always smaller. */

From 9b6c479c5f418e6174f528f0b25d944f74172c61 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 31 Mar 2026 11:05:20 -0700
Subject: [PATCH 062/131] perf tests: Write test files to tmpdir

Writing the test output files to the current working directory can
fail in various contexts such as continual testing. Other tests write
to a mktemp-ed file; make the "perf script task-analyzer tests" follow
this convention too. Currently this isn't possible for the perf.data
file due to a lack of perf script support, so add a variable for when
this support is available.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/test_task_analyzer.sh | 42 +++++++++++---------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/tools/perf/tests/shell/test_task_analyzer.sh b/tools/perf/tests/shell/test_task_analyzer.sh
index e194fcf61df3..0314412e63b4 100755
--- a/tools/perf/tests/shell/test_task_analyzer.sh
+++ b/tools/perf/tests/shell/test_task_analyzer.sh
@@ -3,6 +3,11 @@
 # SPDX-License-Identifier: GPL-2.0
 
 tmpdir=$(mktemp -d /tmp/perf-script-task-analyzer-XXXXX)
+# TODO: perf script report only supports input from the CWD perf.data file, make
+# it support input from any file.
+perfdata="perf.data"
+csv="$tmpdir/csv"
+csvsummary="$tmpdir/csvsummary"
 err=0
 
 # set PERF_EXEC_PATH to find scripts in the source directory
@@ -15,11 +20,10 @@ fi
 export ASAN_OPTIONS=detect_leaks=0
 
 cleanup() {
-  rm -f perf.data
-  rm -f perf.data.old
-  rm -f csv
-  rm -f csvsummary
+  rm -f "${perfdata}"
+  rm -f "${perfdata}".old
   rm -rf "$tmpdir"
+
   trap - exit term int
 }
 
@@ -61,10 +65,10 @@ skip_no_probe_record_support() {
 
 prepare_perf_data() {
 	# 1s should be sufficient to catch at least some switches
-	perf record -e sched:sched_switch -a -- sleep 1 > /dev/null 2>&1
+	perf record -e sched:sched_switch -a -o "${perfdata}" -- sleep 1 > /dev/null 2>&1
 	# check if perf data file got created in above step.
-	if [ ! -e "perf.data" ]; then
-		printf "FAIL: perf record failed to create \"perf.data\" \n"
+	if [ ! -e "${perfdata}" ]; then
+		printf "FAIL: perf record failed to create \"${perfdata}\" \n"
 		return 1
 	fi
 }
@@ -130,28 +134,28 @@ test_extended_times_summary_ns() {
 }
 
 test_csv() {
-	perf script report task-analyzer --csv csv > /dev/null
-	check_exec_0 "perf script report task-analyzer --csv csv"
-	find_str_or_fail "Comm;" csv "${FUNCNAME[0]}"
+	perf script report task-analyzer --csv "${csv}" > /dev/null
+	check_exec_0 "perf script report task-analyzer --csv ${csv}"
+	find_str_or_fail "Comm;" "${csv}" "${FUNCNAME[0]}"
 }
 
 test_csv_extended_times() {
-	perf script report task-analyzer --csv csv --extended-times > /dev/null
-	check_exec_0 "perf script report task-analyzer --csv csv --extended-times"
-	find_str_or_fail "Out-Out;" csv "${FUNCNAME[0]}"
+	perf script report task-analyzer --csv "${csv}" --extended-times > /dev/null
+	check_exec_0 "perf script report task-analyzer --csv ${csv} --extended-times"
+	find_str_or_fail "Out-Out;" "${csv}" "${FUNCNAME[0]}"
 }
 
 test_csvsummary() {
-	perf script report task-analyzer --csv-summary csvsummary > /dev/null
-	check_exec_0 "perf script report task-analyzer --csv-summary csvsummary"
-	find_str_or_fail "Comm;" csvsummary "${FUNCNAME[0]}"
+	perf script report task-analyzer --csv-summary "${csvsummary}" > /dev/null
+	check_exec_0 "perf script report task-analyzer --csv-summary ${csvsummary}"
+	find_str_or_fail "Comm;" "${csvsummary}" "${FUNCNAME[0]}"
 }
 
 test_csvsummary_extended() {
-	perf script report task-analyzer --csv-summary csvsummary --summary-extended \
+	perf script report task-analyzer --csv-summary "${csvsummary}" --summary-extended \
 	>/dev/null
-	check_exec_0 "perf script report task-analyzer --csv-summary csvsummary --summary-extended"
-	find_str_or_fail "Out-Out;" csvsummary "${FUNCNAME[0]}"
+	check_exec_0 "perf script report task-analyzer --csv-summary ${csvsummary} --summary-extended"
+	find_str_or_fail "Out-Out;" "${csvsummary}" "${FUNCNAME[0]}"
 }
 
 skip_no_probe_record_support

From d9db9c8db56c3e378aa5c91637664f77ca5a6f72 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 18 Mar 2026 23:45:13 -0700
Subject: [PATCH 063/131] perf test: Fix perf stat --bpf-counters on hybrid
 machines

The test constantly fails on my Intel hybrid machine.  The issue is that
the output has two events even though I only gave it one event.

  $ perf stat -e instructions -- perf test -w sqrtloop

   Performance counter stats for 'perf test -w sqrtloop':

         910,856,421      cpu_atom/instructions/                (28.05%)
      14,852,865,997      cpu_core/instructions/                (96.79%)

         1.014313341 seconds time elapsed

         1.004114000 seconds user
         0.008174000 seconds sys

Let's modify the awk script to add the values for each line and print
the total.  The variable 'i' holds the number of input lines that have
valid output and the variable 'c' holds the sum of the actual counter
values.  That way it should work on any platform.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/stat_bpf_counters.sh | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tools/perf/tests/shell/stat_bpf_counters.sh b/tools/perf/tests/shell/stat_bpf_counters.sh
index f43e28a136d3..35463358b273 100755
--- a/tools/perf/tests/shell/stat_bpf_counters.sh
+++ b/tools/perf/tests/shell/stat_bpf_counters.sh
@@ -41,8 +41,14 @@ check_counts()
 test_bpf_counters()
 {
 	printf "Testing --bpf-counters "
-	base_instructions=$(perf stat --no-big-num -e instructions -- $workload 2>&1 | awk '/instructions/ {print $1}')
-	bpf_instructions=$(perf stat --no-big-num --bpf-counters -e instructions -- $workload  2>&1 | awk '/instructions/ {print $1}')
+	base_instructions=$(perf stat --no-big-num -e instructions -- $workload 2>&1 | \
+				awk -v i=0 -v c=0 '/instructions/ { \
+					if ($1 != "<not") { i++; c += $1 } \
+				} END { if (i > 0) printf "%.0f", c; else print "<not" }')
+	bpf_instructions=$(perf stat --no-big-num --bpf-counters -e instructions -- $workload  2>&1 | \
+				awk -v i=0 -v c=0 '/instructions/ { \
+					if ($1 != "<not") { i++; c += $1 } \
+				} END { if (i > 0) printf "%.0f", c; else print "<not" }')
 	check_counts $base_instructions $bpf_instructions
 	compare_number $base_instructions $bpf_instructions
 	echo "[Success]"
@@ -52,8 +58,14 @@ test_bpf_modifier()
 {
 	printf "Testing bpf event modifier "
 	stat_output=$(perf stat --no-big-num -e instructions/name=base_instructions/,instructions/name=bpf_instructions/b -- $workload 2>&1)
-	base_instructions=$(echo "$stat_output"| awk '/base_instructions/ {print $1}')
-	bpf_instructions=$(echo "$stat_output"| awk '/bpf_instructions/ {print $1}')
+	base_instructions=$(echo "$stat_output"| \
+				awk -v i=0 -v c=0 '/base_instructions/ { \
+					if ($1 != "<not") { i++; c += $1 } \
+				} END { if (i > 0) printf "%.0f", c; else print "<not" }')
+	bpf_instructions=$(echo "$stat_output"| \
+				awk -v i=0 -v c=0 '/bpf_instructions/ { \
+					if ($1 != "<not") { i++; c += $1 } \
+				} END { if (i > 0) printf "%.0f", c; else print "<not" }')
 	check_counts $base_instructions $bpf_instructions
 	compare_number $base_instructions $bpf_instructions
 	echo "[Success]"

From b5708a308a5602d4a3caf0720dce452082d443ec Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Wed, 25 Mar 2026 03:24:30 -0700
Subject: [PATCH 064/131] perf stat: Fix crash on arm64

Perf stat is crashing on arm64 hosts with the following issue:

  # make -C tools/perf DEBUG=1
  # perf stat sleep 1
  perf: util/evsel.c:2034: get_group_fd: Assertion `!(!leader->core.fd)' failed.
  [1]    1220794 IOT instruction (core dumped)  ./perf stat

The sorting function introduced by commit a745c0831c15c ("perf stat:
Sort default events/metrics") compares events based on their individual
properties. This can cause events from different groups to be
interleaved, resulting in group members appearing before their leaders
in the sorted evlist.

When the iterator opens events in list order, a group member may be
processed before its leader has been opened.

For example, CPU_CYCLES (idx=32) with leader STALL_SLOT_BACKEND (idx=37)
could be sorted before its leader, causing the crash when CPU_CYCLES
tries to get its group fd from the not-yet-opened leader.

Fix this by comparing events based on their leader's attributes instead
of their own attributes when the events are in different groups. This
ensures all members of a group share the same sort key as their leader,
keeping groups together and guaranteeing leaders are opened before their
members.

Fixes: a745c0831c15c ("perf stat: Sort default events/metrics")
Reported-by: Denis Yaroshevskiy <dyaroshev@meta.com>
Tested-by: Dmitry Ilvokhin <d@ilvokhin.com>
Tested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-stat.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a24326c44297..35934e8bbd51 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1932,25 +1932,33 @@ static int default_evlist_evsel_cmp(void *priv __maybe_unused,
 	const struct evsel *lhs = container_of(lhs_core, struct evsel, core);
 	const struct perf_evsel *rhs_core = container_of(r, struct perf_evsel, node);
 	const struct evsel *rhs = container_of(rhs_core, struct evsel, core);
+	const struct evsel *lhs_leader = evsel__leader(lhs);
+	const struct evsel *rhs_leader = evsel__leader(rhs);
 
-	if (evsel__leader(lhs) == evsel__leader(rhs)) {
+	if (lhs_leader == rhs_leader) {
 		/* Within the same group, respect the original order. */
 		return lhs_core->idx - rhs_core->idx;
 	}
 
-	/* Sort default metrics evsels first, and default show events before those. */
-	if (lhs->default_metricgroup != rhs->default_metricgroup)
-		return lhs->default_metricgroup ? -1 : 1;
+	/*
+	 * Compare using leader's attributes so that all members of a group
+	 * stay together. This ensures leaders are opened before their members.
+	 */
 
-	if (lhs->default_show_events != rhs->default_show_events)
-		return lhs->default_show_events ? -1 : 1;
+	/* Sort default metrics evsels first, and default show events before those. */
+	if (lhs_leader->default_metricgroup != rhs_leader->default_metricgroup)
+		return lhs_leader->default_metricgroup ? -1 : 1;
+
+	if (lhs_leader->default_show_events != rhs_leader->default_show_events)
+		return lhs_leader->default_show_events ? -1 : 1;
 
 	/* Sort by PMU type (prefers legacy types first). */
-	if (lhs->pmu != rhs->pmu)
-		return lhs->pmu->type - rhs->pmu->type;
+	if (lhs_leader->pmu != rhs_leader->pmu)
+		return lhs_leader->pmu->type - rhs_leader->pmu->type;
 
-	/* Sort by name. */
-	return strcmp(evsel__name((struct evsel *)lhs), evsel__name((struct evsel *)rhs));
+	/* Sort by leader's name. */
+	return strcmp(evsel__name((struct evsel *)lhs_leader),
+		      evsel__name((struct evsel *)rhs_leader));
 }
 
 /*

From 4cbceeca56386256dbb5d1ce657c81ba03275ee0 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Wed, 1 Apr 2026 16:05:09 -0700
Subject: [PATCH 065/131] perf trace: Skip unnecessary synthesis for
 summary-only mode

perf trace needs to synthesize task info for the comm name.  The mmap
information is only needed for callchain symbolization, which is not
used by the summary mode.  Also, the total and cgroup summary modes
don't require the task info.  Let's skip the processing if possible.

Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index f487fbaa0ad6..d121640ace6e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2004,9 +2004,13 @@ static int trace__symbols_init(struct trace *trace, int argc, const char **argv,
 	if (err < 0)
 		goto out;
 
+	if (trace->summary_only && trace->summary_mode != SUMMARY__BY_THREAD)
+		goto out;
+
 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
 					    evlist->core.threads, trace__tool_process,
-					    /*needs_mmap=*/callchain_param.enabled,
+					    /*needs_mmap=*/callchain_param.enabled &&
+							   !trace->summary_only,
 					    /*mmap_data=*/false,
 					    /*nr_threads_synthesize=*/1);
 out:

From 9a82bfde4775b7a87cd1a7e791f46f83ae442848 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 23 Mar 2026 11:58:04 -0400
Subject: [PATCH 066/131] perf tools: Fix module symbol resolution for non-zero
 .text sh_addr

When perf resolves symbols from kernel module ELF files (ET_REL),
it converts symbol addresses to file offsets so that sample IPs
can be matched to the correct symbol. The conversion adjusts each
symbol's st_value:

  sym->st_value -= shdr->sh_addr - shdr->sh_offset;

For vmlinux (ET_EXEC), st_value is a virtual address and sh_addr
is the section's virtual base, so subtracting sh_addr and adding
sh_offset correctly yields a file offset.

For kernel modules (ET_REL), st_value is a section-relative
offset. The module loader ignores sh_addr entirely and places
symbols at module_base + st_value. Converting to file offset
requires only adding sh_offset; subtracting sh_addr introduces an
error equal to sh_addr bytes.

When .text has sh_addr == 0 -- the historical norm for simple
modules -- both formulas produce the same result and the bug is
latent. As modules gain more metadata sections before .text (.note,
.static_call.text, etc.), the linker assigns .text a non-zero
sh_addr, exposing the defect. For example, nfsd.ko on this kernel
has sh_addr=0xa80, kvm-intel.ko has sh_addr=0x1e90.

The effect is that all .text symbols in affected modules
shift by sh_addr bytes relative to sample IPs, causing perf
report to attribute samples to incorrect, nearby symbols. This
was observed as 13% of LLC-load-miss samples misattributed
to nfsd_file_get_dio_attrs when the actual hot function was
nfsd_cache_lookup, approximately 0xa80 bytes away in the symbol
table.

Use the existing dso__rel() flag (already set for ET_REL modules)
to select the correct adjustment: add sh_offset for ET_REL,
subtract (sh_addr - sh_offset) for ET_EXEC/ET_DYN.

Fixes: 0131c4ec794a ("perf tools: Make it possible to read object code from kernel modules")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/symbol-elf.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 3cd4e5a03cc5..7afa8a117139 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -1354,8 +1354,12 @@ static int dso__process_kernel_symbol(struct dso *dso, struct map *map,
 	char dso_name[PATH_MAX];
 
 	/* Adjust symbol to map to file offset */
-	if (adjust_kernel_syms)
-		sym->st_value -= shdr->sh_addr - shdr->sh_offset;
+	if (adjust_kernel_syms) {
+		if (dso__rel(dso))
+			sym->st_value += shdr->sh_offset;
+		else
+			sym->st_value -= shdr->sh_addr - shdr->sh_offset;
+	}
 
 	if (strcmp(section_name, (dso__short_name(curr_dso) + dso__short_name_len(dso))) == 0)
 		return 0;

From 77cb9b443b7fff2a93d78cd2e309db030046772f Mon Sep 17 00:00:00 2001
From: Thomas Falcon <thomas.falcon@intel.com>
Date: Thu, 26 Mar 2026 20:59:27 -0500
Subject: [PATCH 067/131] perf test: Fix ratio_to_prev event parsing test

test__ratio_to_prev() assumed the first event in a group is the leader,
which is not the case when the event is expanded into two event groups
on hybrid PMUs with auto counter reload support. Instead, iterate over the
event group generated for each core PMU. Also update the "wrong leader" test to
check that the subordinate event has the correct leader instead of checking
that it is not the group leader. Finally, do not exit immediately if a PMU
without auto counter reload support is found.

Signed-off-by: Thomas Falcon <thomas.falcon@intel.com>
Reviewed-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Fixes: 56be0fe5f62c ("perf record: Add auto counter reload parse and regression tests")
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/parse-events.c | 49 +++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c
index 1d3cc224fbc2..05c3e899b425 100644
--- a/tools/perf/tests/parse-events.c
+++ b/tools/perf/tests/parse-events.c
@@ -1796,31 +1796,38 @@ static bool test__acr_valid(void)
 
 static int test__ratio_to_prev(struct evlist *evlist)
 {
-	struct evsel *evsel;
+	struct evsel *evsel, *leader;
 
 	TEST_ASSERT_VAL("wrong number of entries", 2 * perf_pmus__num_core_pmus() == evlist->core.nr_entries);
 
-	 evlist__for_each_entry(evlist, evsel) {
-		if (!perf_pmu__has_format(evsel->pmu, "acr_mask"))
-			return TEST_OK;
-
-		if (evsel == evlist__first(evlist)) {
-			TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2);
-			TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel));
-			TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2);
-			TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0);
-			TEST_ASSERT_EVSEL("unexpected event",
-					evsel__match(evsel, HARDWARE, HW_CPU_CYCLES),
-					evsel);
-		} else {
-			TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2);
-			TEST_ASSERT_VAL("wrong leader", !evsel__is_group_leader(evsel));
-			TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 0);
-			TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1);
-			TEST_ASSERT_EVSEL("unexpected event",
-					evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS),
-					evsel);
+	evlist__for_each_entry(evlist, evsel) {
+		if (evsel != evsel__leader(evsel) ||
+		    !perf_pmu__has_format(evsel->pmu, "acr_mask")) {
+			continue;
 		}
+		leader = evsel;
+		/* cycles */
+		TEST_ASSERT_VAL("wrong config2", 0 == leader->core.attr.config2);
+		TEST_ASSERT_VAL("wrong core.nr_members", leader->core.nr_members == 2);
+		TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(leader) == 0);
+		TEST_ASSERT_EVSEL("unexpected event",
+				  evsel__match(leader, HARDWARE, HW_CPU_CYCLES),
+				  leader);
+		/*
+		 * The period value gets configured within evlist__config,
+		 * while this test executes only parse events method.
+		 */
+		TEST_ASSERT_VAL("wrong period", 0 == leader->core.attr.sample_period);
+
+		 /* instructions/period=200000,ratio-to-prev=2.0/ */
+		evsel = evsel__next(evsel);
+		TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2);
+		TEST_ASSERT_VAL("wrong leader", evsel__has_leader(evsel, leader));
+		TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 0);
+		TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1);
+		TEST_ASSERT_EVSEL("unexpected event",
+				  evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS),
+				  evsel);
 		/*
 		 * The period value gets configured within evlist__config,
 		 * while this test executes only parse events method.

From ff6be45adb1989698867938157f9317ae0bba936 Mon Sep 17 00:00:00 2001
From: Anubhav Shelat <ashelat@redhat.com>
Date: Wed, 1 Apr 2026 09:24:43 -0400
Subject: [PATCH 068/131] perf tools: prevent null dsos from being added

When sorting the dso array, we sometimes get a crash due to NULL entries
being compared in the comparator functions. So prevent __dsos__add() from
adding NULL to the dso array to avoid out-of-memory related errors.

Signed-off-by: Anubhav Shelat <ashelat@redhat.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/dsos.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c
index 0a7645c7fae7..5cf8c878bab2 100644
--- a/tools/perf/util/dsos.c
+++ b/tools/perf/util/dsos.c
@@ -196,6 +196,9 @@ static struct dso *__dsos__find_by_longname_id(struct dsos *dsos,
 
 int __dsos__add(struct dsos *dsos, struct dso *dso)
 {
+	if (!dso)
+		return -EINVAL;
+
 	if (dsos->cnt == dsos->allocated) {
 		unsigned int to_allocate = 2;
 		struct dso **temp;

From eb27e1c885ea75c1661188a548d100c8bce5970a Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 1 Apr 2026 14:21:01 +0200
Subject: [PATCH 069/131] perf test: Skip perf data type profiling tests for
 s390

Test case 'perf data type profiling tests' fails on s390 with this
error:

  # ./perf mem record -- ./perf test -w code_with_type
  failed: no PMU supports the memory events
  # echo $?
  255
  #

because s390 does not support memory events at all. According to the
man page, perf annotate --code-with-type works with memory
instructions only.  As the command 'perf mem record ...' is not
supported on s390, skip this test for s390.

Output before:
 # ./perf test 'perf data type profiling tests'
 77: perf data type profiling tests                        : FAILED!

Output after:
 # ./perf test 'perf data type profiling tests'
 77: perf data type profiling tests                        : Skip

Fixes: f60a5c22967b8 ("perf tests: Test annotate with data type profiling and rust")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Cc: Dmitrii Dolgov <9erthalion6@gmail.com>
Suggested-by: Namhyung Kim <namhyung@kernel.org>
Suggested-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/data_type_profiling.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/tests/shell/data_type_profiling.sh b/tools/perf/tests/shell/data_type_profiling.sh
index fb47b7213b33..eca694600a04 100755
--- a/tools/perf/tests/shell/data_type_profiling.sh
+++ b/tools/perf/tests/shell/data_type_profiling.sh
@@ -15,6 +15,10 @@ err=0
 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX)
 perfout=$(mktemp /tmp/__perf_test.perf.out.XXXXX)
 
+# Check for support of perf mem before trap handler
+perf mem record -o /dev/null -- true  2>&1 | \
+  		grep -q "failed: no PMU supports the memory events" && exit 2
+
 cleanup() {
   rm -rf "${perfdata}" "${perfout}"
   rm -rf "${perfdata}".old

From 3a61fd866ef9aaa1d3158b460f852b74a2df07f4 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 2 Apr 2026 17:04:47 +0100
Subject: [PATCH 070/131] perf expr: Return -EINVAL for syntax error in
 expr__find_ids()

expr__find_ids() propagates the parser return value directly.  For syntax
errors, the parser can return a positive value, but callers treat it as
success, e.g., in the case below on an Arm64 platform:

  metric expr 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES) for backend_bound
  parsing metric: 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES)
  Failure to read '#slots' literal: #slots = nan
  syntax error

Convert positive parser returns in expr__find_ids() to -EINVAL so that
the error value is respected by callers.

Before:

  perf stat -C 5
  Failure to read '#slots'Failure to read '#slots'Failure to read '#slots'Failure to read '#slots'Segmentation fault

After:

  perf stat -C 5
  Failure to read '#slots'Cannot find metric or group `Default'

Fixes: ded80bda8bc9 ("perf expr: Migrate expr ids table to a hashmap")
Signed-off-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/expr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index 465fe2e9bbbe..b7664cb68554 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -376,7 +376,8 @@ int expr__find_ids(const char *expr, const char *one,
 	if (one)
 		expr__del_id(ctx, one);
 
-	return ret;
+	/* A positive value means syntax error, convert to -EINVAL */
+	return ret > 0 ? -EINVAL : ret;
 }
 
 double expr_id_data__value(const struct expr_id_data *data)

From d148934beeacaf074e1e6f00fae3be737bbc4089 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 2 Apr 2026 17:04:48 +0100
Subject: [PATCH 071/131] perf expr: Add '\n' in literal parse errors

Add a trailing newline for logs.

Before:

  perf stat -C 5
  Failure to read '#slots'Cannot find metric or group `Default'

After:

  perf stat -C 5
  Failure to read '#slots'
  Cannot find metric or group `Default'

Signed-off-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/expr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/expr.c b/tools/perf/util/expr.c
index b7664cb68554..644769e92708 100644
--- a/tools/perf/util/expr.c
+++ b/tools/perf/util/expr.c
@@ -407,9 +407,9 @@ double expr__get_literal(const char *literal, const struct expr_scanner_ctx *ctx
 					 &count))
 			result = count;
 		else
-			pr_err("Failure to read '%s'", literal);
+			pr_err("Failure to read '%s'\n", literal);
 	} else {
-		pr_err("Unrecognized literal '%s'", literal);
+		pr_err("Unrecognized literal '%s'\n", literal);
 	}
 
 	pr_debug2("literal: %s = %f\n", literal, result);

From e0f4767bf403131f7ec7378d0d23ad6c29b01936 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Thu, 2 Apr 2026 17:04:49 +0100
Subject: [PATCH 072/131] perf metricgroup: Refine error logs

Return -ENOENT when no metric/group matches, and directly use the return
value from expr__find_ids(), so -EINVAL is reserved for parse failures.

Print separate logs to make it clear.

Before:

  perf stat -C 5 -vvv
  Using CPUID 0x00000000410fd490
  metric expr 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES) for backend_bound
  parsing metric: 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES)
  Failure to read '#slots'
  literal: #slots = nan
  syntax error
  Cannot find metric or group `Default'

After:

  perf stat -C 5 -vvv
  Using CPUID 0x00000000410fd490
  metric expr 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES) for backend_bound
  parsing metric: 100 * (STALL_SLOT_BACKEND / (CPU_CYCLES * #slots) - BR_MIS_PRED * 3 / CPU_CYCLES)
  Failure to read '#slots'
  literal: #slots = nan
  syntax error
  Fail to parse metric or group `Default'

Signed-off-by: Leo Yan <leo.yan@arm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/metricgroup.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c
index f7d53b4e46f4..4db9578efd81 100644
--- a/tools/perf/util/metricgroup.c
+++ b/tools/perf/util/metricgroup.c
@@ -914,10 +914,9 @@ static int __add_metric(struct list_head *metric_list,
 		expr = metric_no_threshold ? pm->metric_name : pm->metric_threshold;
 		visited_node.name = "__threshold__";
 	}
-	if (expr__find_ids(expr, NULL, root_metric->pctx) < 0) {
-		/* Broken metric. */
-		ret = -EINVAL;
-	}
+
+	ret = expr__find_ids(expr, NULL, root_metric->pctx);
+
 	if (!ret) {
 		/* Resolve referenced metrics. */
 		struct perf_pmu *pmu;
@@ -1101,7 +1100,7 @@ static int metricgroup__add_metric(const char *pmu, const char *metric_name, con
 	 */
 	ret = metricgroup__for_each_metric(table, metricgroup__add_metric_callback, &data);
 	if (!ret && !data.has_match)
-		ret = -EINVAL;
+		ret = -ENOENT;
 
 	/*
 	 * add to metric_list so that they can be released
@@ -1152,6 +1151,8 @@ static int metricgroup__add_metric_list(const char *pmu, const char *list,
 					      user_requested_cpu_list,
 					      system_wide, metric_list, table);
 		if (ret == -EINVAL)
+			pr_err("Fail to parse metric or group `%s'\n", metric_name);
+		else if (ret == -ENOENT)
 			pr_err("Cannot find metric or group `%s'\n", metric_name);
 
 		if (ret)

From 85a9a4abcdc09ee941273c99d3ad0bc2ddef09ea Mon Sep 17 00:00:00 2001
From: SeungJu Cheon <suunj1331@gmail.com>
Date: Fri, 3 Apr 2026 01:04:10 +0900
Subject: [PATCH 073/131] perf header: Validate build_id filename length to
 prevent buffer overflow

The build_id parsing functions calculate a filename length from the
event header size and read directly into a stack buffer of PATH_MAX
bytes without bounds checking. A malformed perf.data file with a
crafted header.size can cause the length to be negative or exceed
PATH_MAX, resulting in a stack buffer overflow.

Add bounds checking for the filename length in both
perf_header__read_build_ids() and the ABI quirk variant. Print a
warning message when an invalid length is detected.

Signed-off-by: SeungJu Cheon <suunj1331@gmail.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 9142a8ba4019..9ffc0f4ca6d1 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2545,6 +2545,11 @@ static int perf_header__read_build_ids_abi_quirk(struct perf_header *header,
 			perf_event_header__bswap(&old_bev.header);
 
 		len = old_bev.header.size - sizeof(old_bev);
+		if (len < 0 || len >= PATH_MAX) {
+			pr_warning("invalid build_id filename length %zd\n", len);
+			return -1;
+		}
+
 		if (readn(input, filename, len) != len)
 			return -1;
 
@@ -2587,6 +2592,11 @@ static int perf_header__read_build_ids(struct perf_header *header,
 			perf_event_header__bswap(&bev.header);
 
 		len = bev.header.size - sizeof(bev);
+		if (len < 0 || len >= PATH_MAX) {
+			pr_warning("invalid build_id filename length %zd\n", len);
+			goto out;
+		}
+
 		if (readn(input, filename, len) != len)
 			goto out;
 		/*

From 11e8d234d4be7af401e8a24e078005ecd9bc1d1a Mon Sep 17 00:00:00 2001
From: Michael Petlan <mpetlan@redhat.com>
Date: Thu, 2 Apr 2026 16:51:18 +0200
Subject: [PATCH 074/131] perf trace: Fix potential u64 underflow in duration
 calculation

Although it happens very rarely, in case of out-of-order events (i.e.
due to CPU migration when a syscall is executed), the calculation of
event duration might underflow and thus a bogus value is printed:

    2.804 ( 0.001 ms): :49553/49553 rt_sigaction(sig: QUIT, act: 0x7fff403ed6e0, oact: 0x7fff403ed780, sigsetsize: 8) = 0
    2.807 ( 0.001 ms): :49553/49553 rt_sigaction(sig: CHLD, act: 0x7fff403ed6e0, oact: 0x7fff403ed780, sigsetsize: 8) = 0
    2.815 (18446744073709.438 ms): :49553/49553 execve(filename: 0xbb173a30, argv: 0x55aabb171930, envp: 0x55aabb171120) = 0
    2.815 ( 0.534 ms): pwd/49553  ... [continued]: execve())                                           = 0

Check for possible underflow first and in case of a bogus value, do
not print it.

    2.804 ( 0.001 ms): :49553/49553 rt_sigaction(sig: QUIT, act: 0x7fff403ed6e0, oact: 0x7fff403ed780, sigsetsize: 8) = 0
    2.807 ( 0.001 ms): :49553/49553 rt_sigaction(sig: CHLD, act: 0x7fff403ed6e0, oact: 0x7fff403ed780, sigsetsize: 8) = 0
    2.815 (         ): :49553/49553 execve(filename: 0xbb173a30, argv: 0x55aabb171930, envp: 0x55aabb171120) = 0
    2.815 ( 0.534 ms): pwd/49553  ... [continued]: execve())                                           = 0

Signed-off-by: Michael Petlan <mpetlan@redhat.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d121640ace6e..873d144807e2 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2960,7 +2960,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel,
 		++trace->stats.vfs_getname;
 	}
 
-	if (ttrace->entry_time) {
+	if (ttrace->entry_time && sample->time >= ttrace->entry_time) {
 		duration = sample->time - ttrace->entry_time;
 		if (trace__filter_duration(trace, duration))
 			goto out;

From 623030fd0ad59ecc4197b0c0f8dd066a0f0598b3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:17 -0700
Subject: [PATCH 075/131] perf clockid: Add missing include

clockid_t is declared in time.h but the include is missing. Reordering
header files may result in build breakages. Add the include to avoid
this.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/clockid.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/clockid.h b/tools/perf/util/clockid.h
index 9b49b4711c76..33dbd8673c1c 100644
--- a/tools/perf/util/clockid.h
+++ b/tools/perf/util/clockid.h
@@ -1,8 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-
 #ifndef __PERF_CLOCKID_H
 #define __PERF_CLOCKID_H
 
+#include <time.h>
+
 struct option;
 int parse_clockid(const struct option *opt, const char *str, int unset);
 

From 8cc518735beb879c51df712a5ce5893c02f81b12 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:18 -0700
Subject: [PATCH 076/131] perf header: Add utility to convert feature number to
 a string

For logging and debug messages it can be convenient to convert a
feature number to a name. Add header_feat__name for this and reuse the
data already within the feat_ops struct.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 7 +++++++
 tools/perf/util/header.h | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 9ffc0f4ca6d1..34178ce826fb 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3780,6 +3780,13 @@ struct header_print_data {
 	bool full; /* extended list of headers */
 };
 
+const char *header_feat__name(unsigned int id)
+{
+	if (id < HEADER_LAST_FEATURE)
+		return feat_ops[id].name ?: "INVALID";
+	return "INVALID";
+}
+
 static int perf_file_section__fprintf_info(struct perf_file_section *section,
 					   struct perf_header *ph,
 					   int feat, int fd, void *data)
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index cc40ac796f52..ca22030a1434 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -132,6 +132,8 @@ struct perf_header_feature_ops {
 
 extern const char perf_version_string[];
 
+const char *header_feat__name(unsigned int id);
+
 int perf_session__read_header(struct perf_session *session);
 int perf_session__write_header(struct perf_session *session,
 			       struct evlist *evlist,

From 598de368375ed4ffaa23086524ea7dbb5b7fd256 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:19 -0700
Subject: [PATCH 077/131] perf header: Properly warn/print when
 libtraceevent/libbpf support is missing

Because the features were removed from feat_ops with ifdefs, the previous
logic would print "# (null)" when perf processed a feature that lacked
builtin support. Remove the ifdefs from feat_ops and in the relevant
functions print errors/messages about the lack of support.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 70 +++++++++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 23 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 34178ce826fb..9f1fe35a6b8a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -306,16 +306,19 @@ static int do_read_bitmap(struct feat_fd *ff, unsigned long **pset, u64 *psize)
 	return 0;
 }
 
-#ifdef HAVE_LIBTRACEEVENT
 static int write_tracing_data(struct feat_fd *ff,
-			      struct evlist *evlist)
+			      struct evlist *evlist __maybe_unused)
 {
 	if (WARN(ff->buf, "Error: calling %s in pipe-mode.\n", __func__))
 		return -1;
 
+#ifdef HAVE_LIBTRACEEVENT
 	return read_tracing_data(ff->fd, &evlist->core.entries);
-}
+#else
+	pr_err("ERROR: Trying to write tracing data without libtraceevent support.\n");
+	return -1;
 #endif
+}
 
 static int write_build_id(struct feat_fd *ff,
 			  struct evlist *evlist __maybe_unused)
@@ -1026,10 +1029,10 @@ static int write_dir_format(struct feat_fd *ff,
 	return do_write(ff, &data->dir.version, sizeof(data->dir.version));
 }
 
-#ifdef HAVE_LIBBPF_SUPPORT
-static int write_bpf_prog_info(struct feat_fd *ff,
+static int write_bpf_prog_info(struct feat_fd *ff  __maybe_unused,
 			       struct evlist *evlist __maybe_unused)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct perf_env *env = &ff->ph->env;
 	struct rb_root *root;
 	struct rb_node *next;
@@ -1067,11 +1070,16 @@ static int write_bpf_prog_info(struct feat_fd *ff,
 out:
 	up_read(&env->bpf_progs.lock);
 	return ret;
+#else
+	pr_err("ERROR: Trying to write bpf_prog_info without libbpf support.\n");
+	return -1;
+#endif // HAVE_LIBBPF_SUPPORT
 }
 
-static int write_bpf_btf(struct feat_fd *ff,
+static int write_bpf_btf(struct feat_fd *ff __maybe_unused,
 			 struct evlist *evlist __maybe_unused)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct perf_env *env = &ff->ph->env;
 	struct rb_root *root;
 	struct rb_node *next;
@@ -1100,8 +1108,11 @@ static int write_bpf_btf(struct feat_fd *ff,
 out:
 	up_read(&env->bpf_progs.lock);
 	return ret;
-}
+#else
+	pr_err("ERROR: Trying to write btf data without libbpf support.\n");
+	return -1;
 #endif // HAVE_LIBBPF_SUPPORT
+}
 
 static int cpu_cache_level__sort(const void *a, const void *b)
 {
@@ -1980,9 +1991,9 @@ static void print_dir_format(struct feat_fd *ff, FILE *fp)
 	fprintf(fp, "# directory data version : %"PRIu64"\n", data->dir.version);
 }
 
-#ifdef HAVE_LIBBPF_SUPPORT
-static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp)
+static void print_bpf_prog_info(struct feat_fd *ff __maybe_unused, FILE *fp)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct perf_env *env = &ff->ph->env;
 	struct rb_root *root;
 	struct rb_node *next;
@@ -1993,7 +2004,7 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp)
 	next = rb_first(root);
 
 	if (!next)
-		printf("# bpf_prog_info empty\n");
+		fprintf(fp, "# bpf_prog_info empty\n");
 
 	while (next) {
 		struct bpf_prog_info_node *node;
@@ -2006,10 +2017,14 @@ static void print_bpf_prog_info(struct feat_fd *ff, FILE *fp)
 	}
 
 	up_read(&env->bpf_progs.lock);
+#else
+	fprintf(fp, "# bpf_prog_info missing, no libbpf support\n");
+#endif // HAVE_LIBBPF_SUPPORT
 }
 
-static void print_bpf_btf(struct feat_fd *ff, FILE *fp)
+static void print_bpf_btf(struct feat_fd *ff __maybe_unused, FILE *fp)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct perf_env *env = &ff->ph->env;
 	struct rb_root *root;
 	struct rb_node *next;
@@ -2031,8 +2046,10 @@ static void print_bpf_btf(struct feat_fd *ff, FILE *fp)
 	}
 
 	up_read(&env->bpf_progs.lock);
-}
+#else
+	fprintf(fp, "# bpf btf data missing, no libbpf support\n");
 #endif // HAVE_LIBBPF_SUPPORT
+}
 
 static void free_event_desc(struct evsel *events)
 {
@@ -2654,14 +2671,17 @@ static int process_e_machine(struct feat_fd *ff, void *data __maybe_unused)
 	return do_read_u32(ff, &ff->ph->env.e_flags);
 }
 
-#ifdef HAVE_LIBTRACEEVENT
-static int process_tracing_data(struct feat_fd *ff, void *data)
+static int process_tracing_data(struct feat_fd *ff __maybe_unused, void *data __maybe_unused)
 {
+#ifdef HAVE_LIBTRACEEVENT
 	ssize_t ret = trace_report(ff->fd, data, false);
 
 	return ret < 0 ? -1 : 0;
-}
+#else
+	pr_err("ERROR: Trying to read tracing data without libtraceevent support.\n");
+	return -1;
 #endif
+}
 
 static int process_build_id(struct feat_fd *ff, void *data __maybe_unused)
 {
@@ -3340,9 +3360,9 @@ static int process_dir_format(struct feat_fd *ff,
 	return do_read_u64(ff, &data->dir.version);
 }
 
-#ifdef HAVE_LIBBPF_SUPPORT
-static int process_bpf_prog_info(struct feat_fd *ff, void *data __maybe_unused)
+static int process_bpf_prog_info(struct feat_fd *ff __maybe_unused, void *data __maybe_unused)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct bpf_prog_info_node *info_node;
 	struct perf_env *env = &ff->ph->env;
 	struct perf_bpil *info_linear;
@@ -3412,10 +3432,15 @@ out:
 	free(info_node);
 	up_write(&env->bpf_progs.lock);
 	return err;
+#else
+	pr_err("ERROR: Trying to read bpf_prog_info without libbpf support.\n");
+	return -1;
+#endif // HAVE_LIBBPF_SUPPORT
 }
 
-static int process_bpf_btf(struct feat_fd *ff, void *data __maybe_unused)
+static int process_bpf_btf(struct feat_fd *ff  __maybe_unused, void *data __maybe_unused)
 {
+#ifdef HAVE_LIBBPF_SUPPORT
 	struct perf_env *env = &ff->ph->env;
 	struct btf_node *node = NULL;
 	u32 count, i;
@@ -3459,8 +3484,11 @@ out:
 	up_write(&env->bpf_progs.lock);
 	free(node);
 	return err;
-}
+#else
+	pr_err("ERROR: Trying to read btf data without libbpf support.\n");
+	return -1;
 #endif // HAVE_LIBBPF_SUPPORT
+}
 
 static int process_compressed(struct feat_fd *ff,
 			      void *data __maybe_unused)
@@ -3736,9 +3764,7 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE];
 
 const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
-#ifdef HAVE_LIBTRACEEVENT
 	FEAT_OPN(TRACING_DATA,	tracing_data,	false),
-#endif
 	FEAT_OPN(BUILD_ID,	build_id,	false),
 	FEAT_OPR(HOSTNAME,	hostname,	false),
 	FEAT_OPR(OSRELEASE,	osrelease,	false),
@@ -3762,10 +3788,8 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(MEM_TOPOLOGY,	mem_topology,	true),
 	FEAT_OPR(CLOCKID,	clockid,	false),
 	FEAT_OPN(DIR_FORMAT,	dir_format,	false),
-#ifdef HAVE_LIBBPF_SUPPORT
 	FEAT_OPR(BPF_PROG_INFO, bpf_prog_info,  false),
 	FEAT_OPR(BPF_BTF,       bpf_btf,        false),
-#endif
 	FEAT_OPR(COMPRESSED,	compressed,	false),
 	FEAT_OPR(CPU_PMU_CAPS,	cpu_pmu_caps,	false),
 	FEAT_OPR(CLOCK_DATA,	clock_data,	false),

From cdaebccc1cb5c0f635f6db7fb1570f11b5c9f985 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:20 -0700
Subject: [PATCH 078/131] perf session: Extra logging for failed to process
 events

Print log information in ordered event processing so that the cause of
a finished round failing is clearer. Print the event name along with its
number when an event isn't processed. Add extra detail about where the
failure happened.

The following log lines come from running `perf data convert`. Before:
  0xa250 [0x10]: failed to process type: 80

After:
  0xa250 [0x10]: piped event processing failed for event of type: FEATURE (80)

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/session.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 09de5288f9e1..3a911c70cd0e 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -131,10 +131,17 @@ static int ordered_events__deliver_event(struct ordered_events *oe,
 {
 	struct perf_session *session = container_of(oe, struct perf_session,
 						    ordered_events);
+	int ret =  perf_session__deliver_event(session, event->event,
+					       session->tool, event->file_offset,
+					       event->file_path);
 
-	return perf_session__deliver_event(session, event->event,
-					   session->tool, event->file_offset,
-					   event->file_path);
+	if (ret) {
+		pr_err("%#" PRIx64 " [%#x]: ordered event processing failed (%d) for event of type: %s (%d)\n",
+			event->file_offset, event->event->header.size, ret,
+			perf_event__name(event->event->header.type),
+			event->event->header.type);
+	}
+	return ret;
 }
 
 struct perf_session *__perf_session__new(struct perf_data *data,
@@ -2110,8 +2117,10 @@ more:
 	}
 
 	if ((skip = perf_session__process_event(session, event, head, "pipe")) < 0) {
-		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n",
-		       head, event->header.size, event->header.type);
+		pr_err("%#" PRIx64 " [%#x]: piped event processing failed for event of type: %s (%d)\n",
+			head, event->header.size,
+			perf_event__name(event->header.type),
+			event->header.type);
 		err = -EINVAL;
 		goto out_err;
 	}
@@ -2225,8 +2234,10 @@ static int __perf_session__process_decomp_events(struct perf_session *session)
 		if (size < sizeof(struct perf_event_header) ||
 		    (skip = perf_session__process_event(session, event, decomp->file_pos,
 							decomp->file_path)) < 0) {
-			pr_err("%#" PRIx64 " [%#x]: failed to process type: %d\n",
-				decomp->file_pos + decomp->head, event->header.size, event->header.type);
+			pr_err("%#" PRIx64 " [%#x]: decompress event processing failed for event of type: %s (%d)\n",
+				decomp->file_pos + decomp->head, event->header.size,
+				perf_event__name(event->header.type),
+				event->header.type);
 			return -EINVAL;
 		}
 
@@ -2382,8 +2393,9 @@ reader__read_event(struct reader *rd, struct perf_session *session,
 	if (size < sizeof(struct perf_event_header) ||
 	    (skip = rd->process(session, event, rd->file_pos, rd->path)) < 0) {
 		errno = -skip;
-		pr_err("%#" PRIx64 " [%#x]: failed to process type: %d [%m]\n",
+		pr_err("%#" PRIx64 " [%#x]: processing failed for event of type: %s (%d) [%m]\n",
 		       rd->file_offset + rd->head, event->header.size,
+		       perf_event__name(event->header.type),
 		       event->header.type);
 		err = skip;
 		goto out;

From 8a4aab17c350f7c2ca7c459a9977f8e18f2878f6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:21 -0700
Subject: [PATCH 079/131] perf header: Refactor pipe mode end marker handling

In non-pipe/data mode the header has a 256-bit bitmap representing
whether a feature is enabled or not. In pipe mode features are written
out in perf_event__synthesize_features as PERF_RECORD_HEADER_FEATURE
events with a special zero sized marker for the last feature. If a new
feature is added, the last feature marker event in old pipe mode perf
data appears to be that new feature. As the event is zero sized it will
fail to be processed and generally terminate perf.

Add a last_feat variable to the header that in non-pipe/data mode is
just HEADER_LAST_FEATURE. In pipe mode compute the last_feat by
handling zero sized feature events, assuming they are the marker and
updating last_feat accordingly. A genuine feature event could
potentially also be zero sized, so still process the event as a feature
and just ignore the error if processing fails.

As perf_event__process_feature can properly handle pipe mode data,
migrate users to it except for report that still wants to group events
and stop header printing with the last feature marker. Make
perf_event__process_feature non-fatal in the case of a newer feature
than this version of perf's HEADER_LAST_FEATURE, which was the
behavior all users wanted.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-annotate.c       | 11 +----
 tools/perf/builtin-report.c         | 27 +++++-------
 tools/perf/builtin-script.c         | 11 +----
 tools/perf/util/data-convert-bt.c   |  9 ++--
 tools/perf/util/data-convert-json.c | 12 +----
 tools/perf/util/header.c            | 68 +++++++++++++++++++++++------
 tools/perf/util/header.h            |  4 +-
 tools/perf/util/intel-tpebs.c       | 11 +----
 8 files changed, 75 insertions(+), 78 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 686ad08561d6..530348b6981b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -313,15 +313,6 @@ out_put:
 	return ret;
 }
 
-static int process_feature_event(const struct perf_tool *tool __maybe_unused,
-				 struct perf_session *session,
-				 union perf_event *event)
-{
-	if (event->feat.feat_id < HEADER_LAST_FEATURE)
-		return perf_event__process_feature(session, event);
-	return 0;
-}
-
 static int hist_entry__stdio_annotate(struct hist_entry *he,
 				    struct evsel *evsel,
 				    struct perf_annotate *ann)
@@ -875,7 +866,7 @@ int cmd_annotate(int argc, const char **argv)
 	annotate.tool.id_index	= perf_event__process_id_index;
 	annotate.tool.auxtrace_info	= perf_event__process_auxtrace_info;
 	annotate.tool.auxtrace	= perf_event__process_auxtrace;
-	annotate.tool.feature	= process_feature_event;
+	annotate.tool.feature	= perf_event__process_feature;
 	annotate.tool.ordering_requires_timestamps = true;
 
 	annotate.session = perf_session__new(&data, &annotate.tool);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 343c0ada5ea1..95c0bdba6b11 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -245,25 +245,20 @@ static int process_feature_event(const struct perf_tool *tool,
 				 union perf_event *event)
 {
 	struct report *rep = container_of(tool, struct report, tool);
+	int ret = perf_event__process_feature(tool, session, event);
 
-	if (event->feat.feat_id < HEADER_LAST_FEATURE)
-		return perf_event__process_feature(session, event);
+	if (ret == 0 && event->header.size == sizeof(struct perf_record_header_feature) &&
+	    (int)event->feat.feat_id >= session->header.last_feat) {
+		/*
+		 * (feat_id = HEADER_LAST_FEATURE) is the end marker which means
+		 * all features are received.
+		 */
+		if (rep->header_only)
+			session_done = 1;
 
-	if (event->feat.feat_id != HEADER_LAST_FEATURE) {
-		pr_err("failed: wrong feature ID: %" PRI_lu64 "\n",
-		       event->feat.feat_id);
-		return -1;
-	} else if (rep->header_only) {
-		session_done = 1;
+		setup_forced_leader(rep, session->evlist);
 	}
-
-	/*
-	 * (feat_id = HEADER_LAST_FEATURE) is the end marker which
-	 * means all features are received, now we can force the
-	 * group if needed.
-	 */
-	setup_forced_leader(rep, session->evlist);
-	return 0;
+	return ret;
 }
 
 static int process_sample_event(const struct perf_tool *tool,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index b005b23f9d8c..622130d3aed4 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -3944,15 +3944,6 @@ int process_cpu_map_event(const struct perf_tool *tool,
 	return set_maps(script);
 }
 
-static int process_feature_event(const struct perf_tool *tool __maybe_unused,
-				 struct perf_session *session,
-				 union perf_event *event)
-{
-	if (event->feat.feat_id < HEADER_LAST_FEATURE)
-		return perf_event__process_feature(session, event);
-	return 0;
-}
-
 static int perf_script__process_auxtrace_info(const struct perf_tool *tool,
 					      struct perf_session *session,
 					      union perf_event *event)
@@ -4427,7 +4418,7 @@ script_found:
 #ifdef HAVE_LIBTRACEEVENT
 	script.tool.tracing_data	 = perf_event__process_tracing_data;
 #endif
-	script.tool.feature		 = process_feature_event;
+	script.tool.feature		 = perf_event__process_feature;
 	script.tool.build_id		 = perf_event__process_build_id;
 	script.tool.id_index		 = perf_event__process_id_index;
 	script.tool.auxtrace_info	 = perf_script__process_auxtrace_info;
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index ba1c8e48d495..665bf8eea24b 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1412,13 +1412,10 @@ static int process_feature_event(const struct perf_tool *tool,
 	struct convert *c = container_of(tool, struct convert, tool);
 	struct ctf_writer *cw = &c->writer;
 	struct perf_record_header_feature *fe = &event->feat;
+	int ret = perf_event__process_feature(tool, session, event);
 
-	if (event->feat.feat_id < HEADER_LAST_FEATURE) {
-		int ret = perf_event__process_feature(session, event);
-
-		if (ret)
-			return ret;
-	}
+	if (ret)
+		return ret;
 
 	switch (fe->feat_id) {
 	case HEADER_HOSTNAME:
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 6a626322476a..4b1b2f7bed25 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -326,16 +326,6 @@ static void output_headers(struct perf_session *session, struct convert_json *c)
 	output_json_format(out, false, 2, "]");
 }
 
-static int process_feature_event(const struct perf_tool *tool __maybe_unused,
-				 struct perf_session *session,
-				 union perf_event *event)
-{
-	if (event->feat.feat_id < HEADER_LAST_FEATURE)
-		return perf_event__process_feature(session, event);
-
-	return 0;
-}
-
 int bt_convert__perf2json(const char *input_name, const char *output_name,
 		struct perf_data_convert_opts *opts __maybe_unused)
 {
@@ -375,7 +365,7 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	c.tool.auxtrace       = perf_event__process_auxtrace;
 	c.tool.event_update   = perf_event__process_event_update;
 	c.tool.attr           = perf_event__process_attr;
-	c.tool.feature        = process_feature_event;
+	c.tool.feature        = perf_event__process_feature;
 	c.tool.ordering_requires_timestamps = true;
 
 	if (opts->all) {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 9f1fe35a6b8a..ad7d09a481bb 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3819,11 +3819,11 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section,
 	struct feat_fd ff;
 
 	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
-		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
-				"%d, continuing...\n", section->offset, feat);
+		pr_debug("Failed to lseek to %" PRIu64 " offset for feature %s (%d), continuing...\n",
+			 section->offset, header_feat__name(feat), feat);
 		return 0;
 	}
-	if (feat >= HEADER_LAST_FEATURE) {
+	if (feat >= ph->last_feat) {
 		pr_warning("unknown feature %d\n", feat);
 		return 0;
 	}
@@ -3875,7 +3875,7 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full)
 		return 0;
 
 	fprintf(fp, "# missing features: ");
-	for_each_clear_bit(bit, header->adds_features, HEADER_LAST_FEATURE) {
+	for_each_clear_bit(bit, header->adds_features, header->last_feat) {
 		if (bit)
 			fprintf(fp, "%s ", feat_ops[bit].name);
 	}
@@ -4205,7 +4205,7 @@ int perf_header__process_sections(struct perf_header *header, int fd,
 	if (err < 0)
 		goto out_free;
 
-	for_each_set_bit(feat, header->adds_features, HEADER_LAST_FEATURE) {
+	for_each_set_bit(feat, header->adds_features, header->last_feat) {
 		err = process(sec++, header, feat, fd, data);
 		if (err < 0)
 			goto out_free;
@@ -4420,6 +4420,7 @@ int perf_file_header__read(struct perf_file_header *header,
 	ph->data_offset  = header->data.offset;
 	ph->data_size	 = header->data.size;
 	ph->feat_offset  = header->data.offset + header->data.size;
+	ph->last_feat	 = HEADER_LAST_FEATURE;
 	return 0;
 }
 
@@ -4435,8 +4436,8 @@ static int perf_file_section__process(struct perf_file_section *section,
 	};
 
 	if (lseek(fd, section->offset, SEEK_SET) == (off_t)-1) {
-		pr_debug("Failed to lseek to %" PRIu64 " offset for feature "
-			  "%d, continuing...\n", section->offset, feat);
+		pr_debug("Failed to lseek to %" PRIu64 " offset for feature %s (%d), continuing...\n",
+			 section->offset, header_feat__name(feat), feat);
 		return 0;
 	}
 
@@ -4469,6 +4470,8 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *header,
 	if (ph->needs_swap)
 		header->size = bswap_64(header->size);
 
+	/* The last feature is written out as a 0 sized event and will update this value. */
+	ph->last_feat = 0;
 	return 0;
 }
 
@@ -4701,31 +4704,68 @@ out_delete_evlist:
 	return -ENOMEM;
 }
 
-int perf_event__process_feature(struct perf_session *session,
+int perf_event__process_feature(const struct perf_tool *tool __maybe_unused,
+				struct perf_session *session,
 				union perf_event *event)
 {
 	struct feat_fd ff = { .fd = 0 };
 	struct perf_record_header_feature *fe = (struct perf_record_header_feature *)event;
+	struct perf_header *header = &session->header;
 	int type = fe->header.type;
-	u64 feat = fe->feat_id;
+	int feat = (int)fe->feat_id;
 	int ret = 0;
 	bool print = dump_trace;
+	bool last_feature_mark = false;
 
 	if (type < 0 || type >= PERF_RECORD_HEADER_MAX) {
 		pr_warning("invalid record type %d in pipe-mode\n", type);
 		return 0;
 	}
-	if (feat == HEADER_RESERVED || feat >= HEADER_LAST_FEATURE) {
-		pr_warning("invalid record type %d in pipe-mode\n", type);
+	if (feat == HEADER_RESERVED) {
+		pr_warning("invalid reserved record type in pipe-mode\n");
+		return -1;
+	}
+	if (feat < 0 || feat == INT_MAX) {
+		pr_warning("invalid value for feature type %x\n", feat);
+		return -1;
+	}
+	if (feat >= header->last_feat) {
+		if (event->header.size == sizeof(*fe)) {
+			/*
+			 * Either an unexpected zero size feature or the
+			 * HEADER_LAST_FEATURE mark.
+			 */
+			if (feat > header->last_feat)
+				header->last_feat = min(feat, HEADER_LAST_FEATURE);
+			last_feature_mark = true;
+		} else {
+			/*
+			 * A feature but beyond what is known as in
+			 * bounds. Assume the last feature is 1 beyond this
+			 * feature.
+			 */
+			session->header.last_feat = min(feat + 1, HEADER_LAST_FEATURE);
+		}
+	}
+	if (feat >= HEADER_LAST_FEATURE) {
+		if (!last_feature_mark) {
+			pr_warning("unknown feature %d for data file version (%s) in this version of perf (%s)\n",
+				   feat, header->env.version, perf_version_string);
+		}
+		return 0;
+	}
+	if (event->header.size < sizeof(*fe)) {
+		pr_warning("feature header size too small\n");
 		return -1;
 	}
-
 	ff.buf  = (void *)fe->data;
 	ff.size = event->header.size - sizeof(*fe);
-	ff.ph = &session->header;
+	ff.ph = header;
 
 	if (feat_ops[feat].process && feat_ops[feat].process(&ff, NULL)) {
-		ret = -1;
+		// Processing failed, ignore when this is the last feature mark.
+		if (!last_feature_mark)
+			ret = -1;
 		goto out;
 	}
 
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index ca22030a1434..41ce663d93ff 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -109,6 +109,7 @@ struct perf_header {
 	u64				data_size;
 	u64				feat_offset;
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
+	int				last_feat;
 	struct perf_env 	env;
 };
 
@@ -172,7 +173,8 @@ int perf_header__process_sections(struct perf_header *header, int fd,
 
 int perf_header__fprintf_info(struct perf_session *s, FILE *fp, bool full);
 
-int perf_event__process_feature(struct perf_session *session,
+int perf_event__process_feature(const struct perf_tool *tool,
+				struct perf_session *session,
 				union perf_event *event);
 int perf_event__process_attr(const struct perf_tool *tool, union perf_event *event,
 			     struct evlist **pevlist);
diff --git a/tools/perf/util/intel-tpebs.c b/tools/perf/util/intel-tpebs.c
index 2af5455488b2..8b615dc94e9e 100644
--- a/tools/perf/util/intel-tpebs.c
+++ b/tools/perf/util/intel-tpebs.c
@@ -216,15 +216,6 @@ static int process_sample_event(const struct perf_tool *tool __maybe_unused,
 	return 0;
 }
 
-static int process_feature_event(const struct perf_tool *tool __maybe_unused,
-				 struct perf_session *session,
-				 union perf_event *event)
-{
-	if (event->feat.feat_id < HEADER_LAST_FEATURE)
-		return perf_event__process_feature(session, event);
-	return 0;
-}
-
 static void *__sample_reader(void *arg __maybe_unused)
 {
 	struct perf_session *session;
@@ -237,7 +228,7 @@ static void *__sample_reader(void *arg __maybe_unused)
 
 	perf_tool__init(&tool, /*ordered_events=*/false);
 	tool.sample = process_sample_event;
-	tool.feature = process_feature_event;
+	tool.feature = perf_event__process_feature;
 	tool.attr = perf_event__process_attr;
 
 	session = perf_session__new(&data, &tool);

From fbfdf3143271ca695061fa5882651bb512832044 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:22 -0700
Subject: [PATCH 080/131] perf ordered-events: Event processing consistency
 with the regular reader

Some event processing functions like perf_event__process_tracing_data
return a zero or positive value on success. Ordered event processing
handles any non-zero value as an error, which is inconsistent with
reader__process_events and reader__read_event that only treat negative
values as errors. Make the ordered events error handling consistent
with that of the events reader.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/ordered-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/ordered-events.c b/tools/perf/util/ordered-events.c
index 8c62611f10aa..a5857f9f5af2 100644
--- a/tools/perf/util/ordered-events.c
+++ b/tools/perf/util/ordered-events.c
@@ -243,7 +243,7 @@ static int do_flush(struct ordered_events *oe, bool show_progress)
 		if (iter->timestamp > limit)
 			break;
 		ret = oe->deliver(oe, iter);
-		if (ret)
+		if (ret < 0)
 			return ret;
 
 		ordered_events__delete(oe, iter);

From b1e814f860c758c289dc63825caf322e2cb5e298 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:23 -0700
Subject: [PATCH 081/131] perf evsel: Make unknown event names more unique

In situations like the perf data converter the evsel__name will be
used to create babeltrace events. If the events have the same name
then creation can fail. Avoid these failures by including more
information into the unknown event names.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/evsel.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 5a294595a677..1281af056cec 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -926,7 +926,8 @@ const char *evsel__name(struct evsel *evsel)
 		break;
 
 	case PERF_TYPE_TRACEPOINT:
-		scnprintf(bf, sizeof(bf), "%s", "unknown tracepoint");
+		scnprintf(bf, sizeof(bf), "unknown tracepoint id=%#"PRIx64,
+			  evsel->core.attr.config);
 		break;
 
 	case PERF_TYPE_BREAKPOINT:
@@ -938,8 +939,8 @@ const char *evsel__name(struct evsel *evsel)
 		break;
 
 	default:
-		scnprintf(bf, sizeof(bf), "unknown attr type: %d",
-			  evsel->core.attr.type);
+		scnprintf(bf, sizeof(bf), "unknown event PMU=%d config=%#"PRIx64,
+			  evsel->core.attr.type, evsel->core.attr.config);
 		break;
 	}
 

From 43c0901edaabb59f94d7f136be9b6afcfbc36df8 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 1 Apr 2026 09:13:24 -0700
Subject: [PATCH 082/131] perf data convert ctf: Pipe mode improvements

Handle the finished_round event. Set up the CTF events when the
feature event desc is read. In pipe mode the attr events will create
the evsels and the feature event desc events will name the evsels. The
CTF events need the evsel name, so wait until feature event descs are
read (in pipe mode) before setting up the events except for tracepoint
events. Handle the tracing_data event so that tracepoint information
is available when setting up tracepoint events.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/data-convert-bt.c | 63 +++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 665bf8eea24b..bece77cbc493 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1181,6 +1181,10 @@ static int add_event(struct ctf_writer *cw, struct evsel *evsel)
 	const char *name = evsel__name(evsel);
 	int ret;
 
+	if (evsel->priv) {
+		pr_err("Error: attempt to add already added event %s\n", name);
+		return -1;
+	}
 	pr("Adding event '%s' (type %d)\n", name, evsel->core.attr.type);
 
 	event_class = bt_ctf_event_class_create(name);
@@ -1223,13 +1227,28 @@ err:
 	return -1;
 }
 
-static int setup_events(struct ctf_writer *cw, struct perf_session *session)
+enum setup_events_type {
+	SETUP_EVENTS_ALL,
+	SETUP_EVENTS_NOT_TRACEPOINT,
+	SETUP_EVENTS_TRACEPOINT_ONLY,
+};
+
+static int setup_events(struct ctf_writer *cw, struct perf_session *session,
+			enum setup_events_type type)
 {
 	struct evlist *evlist = session->evlist;
 	struct evsel *evsel;
 	int ret;
 
 	evlist__for_each_entry(evlist, evsel) {
+		bool is_tracepoint = evsel->core.attr.type == PERF_TYPE_TRACEPOINT;
+
+		if (is_tracepoint && type == SETUP_EVENTS_NOT_TRACEPOINT)
+			continue;
+
+		if (!is_tracepoint && type == SETUP_EVENTS_TRACEPOINT_ONLY)
+			continue;
+
 		ret = add_event(cw, evsel);
 		if (ret)
 			return ret;
@@ -1418,6 +1437,18 @@ static int process_feature_event(const struct perf_tool *tool,
 		return ret;
 
 	switch (fe->feat_id) {
+	case HEADER_EVENT_DESC:
+		/*
+		 * In non-pipe mode (not here) the evsels combine the desc with
+		 * the perf_event_attr when it is parsed. In pipe mode the
+		 * perf_event_attr events appear first and then the event desc
+		 * feature events that set the names appear after. Once we have
+		 * the full evsel data we can generate the babeltrace
+		 * events. For tracepoint events we still don't have the tracing
+		 * data and so need to wait until the tracing data event to add
+		 * those events to babeltrace.
+		 */
+		return setup_events(cw, session, SETUP_EVENTS_NOT_TRACEPOINT);
 	case HEADER_HOSTNAME:
 		if (session->header.env.hostname) {
 			return bt_ctf_writer_add_environment_field(cw->writer, "host",
@@ -1448,6 +1479,26 @@ static int process_feature_event(const struct perf_tool *tool,
 	return 0;
 }
 
+static int process_tracing_data(const struct perf_tool *tool,
+				struct perf_session *session,
+				union perf_event *event)
+{
+	struct convert *c = container_of(tool, struct convert, tool);
+	struct ctf_writer *cw = &c->writer;
+	int ret;
+
+	ret = perf_event__process_tracing_data(tool, session, event);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Now the attr was set up by the attr event, the name by the feature
+	 * event desc event and the tracepoint data set up above, the tracepoint
+	 * babeltrace events can be added.
+	 */
+	return setup_events(cw, session, SETUP_EVENTS_TRACEPOINT_ONLY);
+}
+
 static int ctf_writer__setup_clock(struct ctf_writer *cw,
 				   struct perf_session *session,
 				   bool tod)
@@ -1677,9 +1728,10 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	c.tool.exit            = perf_event__process_exit;
 	c.tool.fork            = perf_event__process_fork;
 	c.tool.lost            = perf_event__process_lost;
-	c.tool.tracing_data    = perf_event__process_tracing_data;
+	c.tool.tracing_data    = process_tracing_data;
 	c.tool.build_id        = perf_event__process_build_id;
 	c.tool.namespaces      = perf_event__process_namespaces;
+	c.tool.finished_round  = perf_event__process_finished_round;
 	c.tool.attr            = perf_event__process_attr;
 	c.tool.feature         = process_feature_event;
 	c.tool.ordering_requires_timestamps = true;
@@ -1724,8 +1776,11 @@ int bt_convert__perf2ctf(const char *input, const char *path,
 	if (ctf_writer__setup_env(cw, session))
 		goto free_writer;
 
-	/* CTF events setup */
-	if (setup_events(cw, session))
+	/*
+	 * CTF events setup. Note, in pipe mode no events exist yet (they come
+	 * in via header feature events) and so this does nothing.
+	 */
+	if (setup_events(cw, session, SETUP_EVENTS_ALL))
 		goto free_writer;
 
 	if (opts->all && setup_non_sample_events(cw, session))

From aa0c2bb09bdc5423aa6a0da41762ea0703ed567c Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 9 Dec 2025 09:08:11 -0800
Subject: [PATCH 083/131] perf tests kwork: Add basic kwork coverage tests

Add basic kwork coverage tests for record, report, latency, timehist
and top.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/kwork.sh | 79 +++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100755 tools/perf/tests/shell/kwork.sh

diff --git a/tools/perf/tests/shell/kwork.sh b/tools/perf/tests/shell/kwork.sh
new file mode 100755
index 000000000000..42bfd9382816
--- /dev/null
+++ b/tools/perf/tests/shell/kwork.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# perf kwork tests
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+# Root permissions required for tracing events.
+if [ "$(id -u)" != 0 ]; then
+	echo "[Skip] No root permission"
+	exit 2
+fi
+
+err=0
+perfdata=$(mktemp /tmp/__perf_test_kwork.perf.data.XXXXX)
+
+cleanup() {
+	rm -f "${perfdata}"
+	rm -f "${perfdata}".old
+
+	trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+	echo "Unexpected signal in ${FUNCNAME[1]}"
+	cleanup
+	exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
+test_kwork_record() {
+	echo "Kwork record"
+	perf kwork record -o "${perfdata}" -- sleep 1
+	echo "Kwork record [Success]"
+}
+
+test_kwork_report() {
+	echo "Kwork report"
+	if ! perf kwork report -i "${perfdata}" | grep -q "Kwork Name"; then
+		echo "Kwork report [Failed missing output]"
+		err=1; return # record the failure and skip the [Success] line
+	fi
+	echo "Kwork report [Success]"
+}
+
+test_kwork_latency() {
+	echo "Kwork latency"
+	if ! perf kwork latency -i "${perfdata}" | grep -q "Avg delay"; then
+		echo "Kwork latency [Failed missing output]"
+		err=1; return # record the failure and skip the [Success] line
+	fi
+	echo "Kwork latency [Success]"
+}
+
+test_kwork_timehist() {
+	echo "Kwork timehist"
+	if ! perf kwork timehist -i "${perfdata}" | grep -q "Kwork name"; then
+		echo "Kwork timehist [Failed missing output]"
+		err=1; return # record the failure and skip the [Success] line
+	fi
+	echo "Kwork timehist [Success]"
+}
+
+test_kwork_top() {
+	echo "Kwork top"
+	if ! perf kwork top -i "${perfdata}" | grep -q "COMMAND"; then
+		echo "Kwork top [Failed missing output]"
+		err=1; return # record the failure and skip the [Success] line
+	fi
+	echo "Kwork top [Success]"
+}
+
+test_kwork_record
+test_kwork_report
+test_kwork_latency
+test_kwork_timehist
+test_kwork_top
+
+cleanup
+exit $err

From 210259987d9a7bb8506f3e93c2ddbece15c13b15 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 18 Mar 2026 18:01:03 -0700
Subject: [PATCH 084/131] perf metrics: Make common stalled metrics conditional
 on having the event

The metric code uses the event parsing code but it generally assumes
all events are supported. Arnaldo reported AMD supporting
stalled-cycles-frontend but not stalled-cycles-backend [1]. An issue
with this is that before parsing happens the metric code tries to
share events within groups to reduce the number of events and
multiplexing. If the group has some supported and not supported
events, the whole group will become broken. To avoid this situation
add has_event tests to the metrics for stalled-cycles-frontend and
stalled-cycles-backend. has_event is evaluated when parsing the
metric and its result constant propagated (with if-elses) to reduce
the number of events. This means when the metric code considers
sharing the events, only supported events will be shared.

Note for backporting. This change updates
tools/perf/pmu-events/empty-pmu-events.c, a convenience file for builds
on systems without python present. While the metrics.json code should
backport easily there can be conflicts on empty-pmu-events.c. In this
case the build will have left a file test-empty-pmu-events.c that can
be copied over empty-pmu-events.c to resolve issues and make an
appropriate empty-pmu-events.c for the json in the source tree at the
time of the build.

[1] https://lore.kernel.org/lkml/abm1nR-2xjOUBroD@x1/

Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Closes: https://lore.kernel.org/lkml/abm1nR-2xjOUBroD@x1/
Fixes: c7adeb0974f1 ("perf jevents: Add set of common metrics based on default ones")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 .../arch/common/common/metrics.json           |   6 +-
 tools/perf/pmu-events/empty-pmu-events.c      | 108 +++++++++---------
 2 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/tools/perf/pmu-events/arch/common/common/metrics.json b/tools/perf/pmu-events/arch/common/common/metrics.json
index 0d010b3ebc6d..cefc8bfe7830 100644
--- a/tools/perf/pmu-events/arch/common/common/metrics.json
+++ b/tools/perf/pmu-events/arch/common/common/metrics.json
@@ -46,14 +46,14 @@
     },
     {
         "BriefDescription": "Max front or backend stalls per instruction",
-        "MetricExpr": "max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions",
+        "MetricExpr": "(max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions) if (has_event(stalled\\-cycles\\-frontend) & has_event(stalled\\-cycles\\-backend)) else ((stalled\\-cycles\\-frontend / instructions) if has_event(stalled\\-cycles\\-frontend) else ((stalled\\-cycles\\-backend / instructions) if has_event(stalled\\-cycles\\-backend) else 0))",
         "MetricGroup": "Default",
         "MetricName": "stalled_cycles_per_instruction",
         "DefaultShowEvents": "1"
     },
     {
         "BriefDescription": "Frontend stalls per cycle",
-        "MetricExpr": "stalled\\-cycles\\-frontend / cpu\\-cycles",
+        "MetricExpr": "(stalled\\-cycles\\-frontend / cpu\\-cycles) if has_event(stalled\\-cycles\\-frontend) else 0",
         "MetricGroup": "Default",
         "MetricName": "frontend_cycles_idle",
         "MetricThreshold": "frontend_cycles_idle > 0.1",
@@ -61,7 +61,7 @@
     },
     {
         "BriefDescription": "Backend stalls per cycle",
-        "MetricExpr": "stalled\\-cycles\\-backend / cpu\\-cycles",
+        "MetricExpr": "(stalled\\-cycles\\-backend / cpu\\-cycles) if has_event(stalled\\-cycles\\-backend) else 0",
         "MetricGroup": "Default",
         "MetricName": "backend_cycles_idle",
         "MetricThreshold": "backend_cycles_idle > 0.2",
diff --git a/tools/perf/pmu-events/empty-pmu-events.c b/tools/perf/pmu-events/empty-pmu-events.c
index 76c395cf513c..a92dd0424f79 100644
--- a/tools/perf/pmu-events/empty-pmu-events.c
+++ b/tools/perf/pmu-events/empty-pmu-events.c
@@ -1310,33 +1310,33 @@ static const char *const big_c_string =
 /* offset=128375 */ "migrations_per_second\000Default\000software@cpu\\-migrations\\,name\\=cpu\\-migrations@ * 1e9 / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Process migrations to a new CPU per CPU second\000\0001migrations/sec\000\000\000\000011"
 /* offset=128635 */ "page_faults_per_second\000Default\000software@page\\-faults\\,name\\=page\\-faults@ * 1e9 / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Page faults per CPU second\000\0001faults/sec\000\000\000\000011"
 /* offset=128866 */ "insn_per_cycle\000Default\000instructions / cpu\\-cycles\000insn_per_cycle < 1\000Instructions Per Cycle\000\0001instructions\000\000\000\000001"
-/* offset=128979 */ "stalled_cycles_per_instruction\000Default\000max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions\000\000Max front or backend stalls per instruction\000\000\000\000\000\000001"
-/* offset=129143 */ "frontend_cycles_idle\000Default\000stalled\\-cycles\\-frontend / cpu\\-cycles\000frontend_cycles_idle > 0.1\000Frontend stalls per cycle\000\000\000\000\000\000001"
-/* offset=129273 */ "backend_cycles_idle\000Default\000stalled\\-cycles\\-backend / cpu\\-cycles\000backend_cycles_idle > 0.2\000Backend stalls per cycle\000\000\000\000\000\000001"
-/* offset=129399 */ "cycles_frequency\000Default\000cpu\\-cycles / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Cycles per CPU second\000\0001GHz\000\000\000\000011"
-/* offset=129575 */ "branch_frequency\000Default\000branches / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Branches per CPU second\000\0001000M/sec\000\000\000\000011"
-/* offset=129755 */ "branch_miss_rate\000Default\000branch\\-misses / branches\000branch_miss_rate > 0.05\000Branch miss rate\000\000100%\000\000\000\000001"
-/* offset=129859 */ "l1d_miss_rate\000Default2\000L1\\-dcache\\-load\\-misses / L1\\-dcache\\-loads\000l1d_miss_rate > 0.05\000L1D  miss rate\000\000100%\000\000\000\000001"
-/* offset=129975 */ "llc_miss_rate\000Default2\000LLC\\-load\\-misses / LLC\\-loads\000llc_miss_rate > 0.05\000LLC miss rate\000\000100%\000\000\000\000001"
-/* offset=130076 */ "l1i_miss_rate\000Default3\000L1\\-icache\\-load\\-misses / L1\\-icache\\-loads\000l1i_miss_rate > 0.05\000L1I miss rate\000\000100%\000\000\000\000001"
-/* offset=130191 */ "dtlb_miss_rate\000Default3\000dTLB\\-load\\-misses / dTLB\\-loads\000dtlb_miss_rate > 0.05\000dTLB miss rate\000\000100%\000\000\000\000001"
-/* offset=130297 */ "itlb_miss_rate\000Default3\000iTLB\\-load\\-misses / iTLB\\-loads\000itlb_miss_rate > 0.05\000iTLB miss rate\000\000100%\000\000\000\000001"
-/* offset=130403 */ "l1_prefetch_miss_rate\000Default4\000L1\\-dcache\\-prefetch\\-misses / L1\\-dcache\\-prefetches\000l1_prefetch_miss_rate > 0.05\000L1 prefetch miss rate\000\000100%\000\000\000\000001"
-/* offset=130551 */ "CPI\000\0001 / IPC\000\000\000\000\000\000\000\000000"
-/* offset=130574 */ "IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\000000"
-/* offset=130638 */ "Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\000000"
-/* offset=130805 */ "dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000"
-/* offset=130870 */ "icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000"
-/* offset=130938 */ "cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\000000"
-/* offset=131010 */ "DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\000000"
-/* offset=131105 */ "DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\000000"
-/* offset=131240 */ "DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\000000"
-/* offset=131305 */ "DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\000000"
-/* offset=131374 */ "DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\000000"
-/* offset=131445 */ "M1\000\000ipc + M2\000\000\000\000\000\000\000\000000"
-/* offset=131468 */ "M2\000\000ipc + M1\000\000\000\000\000\000\000\000000"
-/* offset=131491 */ "M3\000\0001 / M3\000\000\000\000\000\000\000\000000"
-/* offset=131512 */ "L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\000000"
+/* offset=128979 */ "stalled_cycles_per_instruction\000Default\000(max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions if has_event(stalled\\-cycles\\-frontend) & has_event(stalled\\-cycles\\-backend) else (stalled\\-cycles\\-frontend / instructions if has_event(stalled\\-cycles\\-frontend) else (stalled\\-cycles\\-backend / instructions if has_event(stalled\\-cycles\\-backend) else 0)))\000\000Max front or backend stalls per instruction\000\000\000\000\000\000001"
+/* offset=129404 */ "frontend_cycles_idle\000Default\000(stalled\\-cycles\\-frontend / cpu\\-cycles if has_event(stalled\\-cycles\\-frontend) else 0)\000frontend_cycles_idle > 0.1\000Frontend stalls per cycle\000\000\000\000\000\000001"
+/* offset=129583 */ "backend_cycles_idle\000Default\000(stalled\\-cycles\\-backend / cpu\\-cycles if has_event(stalled\\-cycles\\-backend) else 0)\000backend_cycles_idle > 0.2\000Backend stalls per cycle\000\000\000\000\000\000001"
+/* offset=129757 */ "cycles_frequency\000Default\000cpu\\-cycles / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Cycles per CPU second\000\0001GHz\000\000\000\000011"
+/* offset=129933 */ "branch_frequency\000Default\000branches / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Branches per CPU second\000\0001000M/sec\000\000\000\000011"
+/* offset=130113 */ "branch_miss_rate\000Default\000branch\\-misses / branches\000branch_miss_rate > 0.05\000Branch miss rate\000\000100%\000\000\000\000001"
+/* offset=130217 */ "l1d_miss_rate\000Default2\000L1\\-dcache\\-load\\-misses / L1\\-dcache\\-loads\000l1d_miss_rate > 0.05\000L1D  miss rate\000\000100%\000\000\000\000001"
+/* offset=130333 */ "llc_miss_rate\000Default2\000LLC\\-load\\-misses / LLC\\-loads\000llc_miss_rate > 0.05\000LLC miss rate\000\000100%\000\000\000\000001"
+/* offset=130434 */ "l1i_miss_rate\000Default3\000L1\\-icache\\-load\\-misses / L1\\-icache\\-loads\000l1i_miss_rate > 0.05\000L1I miss rate\000\000100%\000\000\000\000001"
+/* offset=130549 */ "dtlb_miss_rate\000Default3\000dTLB\\-load\\-misses / dTLB\\-loads\000dtlb_miss_rate > 0.05\000dTLB miss rate\000\000100%\000\000\000\000001"
+/* offset=130655 */ "itlb_miss_rate\000Default3\000iTLB\\-load\\-misses / iTLB\\-loads\000itlb_miss_rate > 0.05\000iTLB miss rate\000\000100%\000\000\000\000001"
+/* offset=130761 */ "l1_prefetch_miss_rate\000Default4\000L1\\-dcache\\-prefetch\\-misses / L1\\-dcache\\-prefetches\000l1_prefetch_miss_rate > 0.05\000L1 prefetch miss rate\000\000100%\000\000\000\000001"
+/* offset=130909 */ "CPI\000\0001 / IPC\000\000\000\000\000\000\000\000000"
+/* offset=130932 */ "IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\000000"
+/* offset=130996 */ "Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\000000"
+/* offset=131163 */ "dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000"
+/* offset=131228 */ "icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000"
+/* offset=131296 */ "cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\000000"
+/* offset=131368 */ "DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\000000"
+/* offset=131463 */ "DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\000000"
+/* offset=131598 */ "DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\000000"
+/* offset=131663 */ "DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\000000"
+/* offset=131732 */ "DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\000000"
+/* offset=131803 */ "M1\000\000ipc + M2\000\000\000\000\000\000\000\000000"
+/* offset=131826 */ "M2\000\000ipc + M1\000\000\000\000\000\000\000\000000"
+/* offset=131849 */ "M3\000\0001 / M3\000\000\000\000\000\000\000\000000"
+/* offset=131870 */ "L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\000000"
 ;
 
 static const struct compact_pmu_event pmu_events__common_default_core[] = {
@@ -2626,22 +2626,22 @@ static const struct pmu_table_entry pmu_events__common[] = {
 
 static const struct compact_pmu_event pmu_metrics__common_default_core[] = {
 { 127956 }, /* CPUs_utilized\000Default\000(software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@) / (duration_time * 1e9)\000\000Average CPU utilization\000\0001CPUs\000\000\000\000011 */
-{ 129273 }, /* backend_cycles_idle\000Default\000stalled\\-cycles\\-backend / cpu\\-cycles\000backend_cycles_idle > 0.2\000Backend stalls per cycle\000\000\000\000\000\000001 */
-{ 129575 }, /* branch_frequency\000Default\000branches / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Branches per CPU second\000\0001000M/sec\000\000\000\000011 */
-{ 129755 }, /* branch_miss_rate\000Default\000branch\\-misses / branches\000branch_miss_rate > 0.05\000Branch miss rate\000\000100%\000\000\000\000001 */
+{ 129583 }, /* backend_cycles_idle\000Default\000(stalled\\-cycles\\-backend / cpu\\-cycles if has_event(stalled\\-cycles\\-backend) else 0)\000backend_cycles_idle > 0.2\000Backend stalls per cycle\000\000\000\000\000\000001 */
+{ 129933 }, /* branch_frequency\000Default\000branches / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Branches per CPU second\000\0001000M/sec\000\000\000\000011 */
+{ 130113 }, /* branch_miss_rate\000Default\000branch\\-misses / branches\000branch_miss_rate > 0.05\000Branch miss rate\000\000100%\000\000\000\000001 */
 { 128142 }, /* cs_per_second\000Default\000software@context\\-switches\\,name\\=context\\-switches@ * 1e9 / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Context switches per CPU second\000\0001cs/sec\000\000\000\000011 */
-{ 129399 }, /* cycles_frequency\000Default\000cpu\\-cycles / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Cycles per CPU second\000\0001GHz\000\000\000\000011 */
-{ 130191 }, /* dtlb_miss_rate\000Default3\000dTLB\\-load\\-misses / dTLB\\-loads\000dtlb_miss_rate > 0.05\000dTLB miss rate\000\000100%\000\000\000\000001 */
-{ 129143 }, /* frontend_cycles_idle\000Default\000stalled\\-cycles\\-frontend / cpu\\-cycles\000frontend_cycles_idle > 0.1\000Frontend stalls per cycle\000\000\000\000\000\000001 */
+{ 129757 }, /* cycles_frequency\000Default\000cpu\\-cycles / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Cycles per CPU second\000\0001GHz\000\000\000\000011 */
+{ 130549 }, /* dtlb_miss_rate\000Default3\000dTLB\\-load\\-misses / dTLB\\-loads\000dtlb_miss_rate > 0.05\000dTLB miss rate\000\000100%\000\000\000\000001 */
+{ 129404 }, /* frontend_cycles_idle\000Default\000(stalled\\-cycles\\-frontend / cpu\\-cycles if has_event(stalled\\-cycles\\-frontend) else 0)\000frontend_cycles_idle > 0.1\000Frontend stalls per cycle\000\000\000\000\000\000001 */
 { 128866 }, /* insn_per_cycle\000Default\000instructions / cpu\\-cycles\000insn_per_cycle < 1\000Instructions Per Cycle\000\0001instructions\000\000\000\000001 */
-{ 130297 }, /* itlb_miss_rate\000Default3\000iTLB\\-load\\-misses / iTLB\\-loads\000itlb_miss_rate > 0.05\000iTLB miss rate\000\000100%\000\000\000\000001 */
-{ 130403 }, /* l1_prefetch_miss_rate\000Default4\000L1\\-dcache\\-prefetch\\-misses / L1\\-dcache\\-prefetches\000l1_prefetch_miss_rate > 0.05\000L1 prefetch miss rate\000\000100%\000\000\000\000001 */
-{ 129859 }, /* l1d_miss_rate\000Default2\000L1\\-dcache\\-load\\-misses / L1\\-dcache\\-loads\000l1d_miss_rate > 0.05\000L1D  miss rate\000\000100%\000\000\000\000001 */
-{ 130076 }, /* l1i_miss_rate\000Default3\000L1\\-icache\\-load\\-misses / L1\\-icache\\-loads\000l1i_miss_rate > 0.05\000L1I miss rate\000\000100%\000\000\000\000001 */
-{ 129975 }, /* llc_miss_rate\000Default2\000LLC\\-load\\-misses / LLC\\-loads\000llc_miss_rate > 0.05\000LLC miss rate\000\000100%\000\000\000\000001 */
+{ 130655 }, /* itlb_miss_rate\000Default3\000iTLB\\-load\\-misses / iTLB\\-loads\000itlb_miss_rate > 0.05\000iTLB miss rate\000\000100%\000\000\000\000001 */
+{ 130761 }, /* l1_prefetch_miss_rate\000Default4\000L1\\-dcache\\-prefetch\\-misses / L1\\-dcache\\-prefetches\000l1_prefetch_miss_rate > 0.05\000L1 prefetch miss rate\000\000100%\000\000\000\000001 */
+{ 130217 }, /* l1d_miss_rate\000Default2\000L1\\-dcache\\-load\\-misses / L1\\-dcache\\-loads\000l1d_miss_rate > 0.05\000L1D  miss rate\000\000100%\000\000\000\000001 */
+{ 130434 }, /* l1i_miss_rate\000Default3\000L1\\-icache\\-load\\-misses / L1\\-icache\\-loads\000l1i_miss_rate > 0.05\000L1I miss rate\000\000100%\000\000\000\000001 */
+{ 130333 }, /* llc_miss_rate\000Default2\000LLC\\-load\\-misses / LLC\\-loads\000llc_miss_rate > 0.05\000LLC miss rate\000\000100%\000\000\000\000001 */
 { 128375 }, /* migrations_per_second\000Default\000software@cpu\\-migrations\\,name\\=cpu\\-migrations@ * 1e9 / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Process migrations to a new CPU per CPU second\000\0001migrations/sec\000\000\000\000011 */
 { 128635 }, /* page_faults_per_second\000Default\000software@page\\-faults\\,name\\=page\\-faults@ * 1e9 / (software@cpu\\-clock\\,name\\=cpu\\-clock@ if #target_cpu else software@task\\-clock\\,name\\=task\\-clock@)\000\000Page faults per CPU second\000\0001faults/sec\000\000\000\000011 */
-{ 128979 }, /* stalled_cycles_per_instruction\000Default\000max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions\000\000Max front or backend stalls per instruction\000\000\000\000\000\000001 */
+{ 128979 }, /* stalled_cycles_per_instruction\000Default\000(max(stalled\\-cycles\\-frontend, stalled\\-cycles\\-backend) / instructions if has_event(stalled\\-cycles\\-frontend) & has_event(stalled\\-cycles\\-backend) else (stalled\\-cycles\\-frontend / instructions if has_event(stalled\\-cycles\\-frontend) else (stalled\\-cycles\\-backend / instructions if has_event(stalled\\-cycles\\-backend) else 0)))\000\000Max front or backend stalls per instruction\000\000\000\000\000\000001 */
 
 };
 
@@ -2714,21 +2714,21 @@ static const struct pmu_table_entry pmu_events__test_soc_cpu[] = {
 };
 
 static const struct compact_pmu_event pmu_metrics__test_soc_cpu_default_core[] = {
-{ 130551 }, /* CPI\000\0001 / IPC\000\000\000\000\000\000\000\000000 */
-{ 131240 }, /* DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\000000 */
-{ 131010 }, /* DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\000000 */
-{ 131105 }, /* DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\000000 */
-{ 131305 }, /* DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\000000 */
-{ 131374 }, /* DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\000000 */
-{ 130638 }, /* Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\000000 */
-{ 130574 }, /* IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\000000 */
-{ 131512 }, /* L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\000000 */
-{ 131445 }, /* M1\000\000ipc + M2\000\000\000\000\000\000\000\000000 */
-{ 131468 }, /* M2\000\000ipc + M1\000\000\000\000\000\000\000\000000 */
-{ 131491 }, /* M3\000\0001 / M3\000\000\000\000\000\000\000\000000 */
-{ 130938 }, /* cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\000000 */
-{ 130805 }, /* dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000 */
-{ 130870 }, /* icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000 */
+{ 130909 }, /* CPI\000\0001 / IPC\000\000\000\000\000\000\000\000000 */
+{ 131598 }, /* DCache_L2_All\000\000DCache_L2_All_Hits + DCache_L2_All_Miss\000\000\000\000\000\000\000\000000 */
+{ 131368 }, /* DCache_L2_All_Hits\000\000l2_rqsts.demand_data_rd_hit + l2_rqsts.pf_hit + l2_rqsts.rfo_hit\000\000\000\000\000\000\000\000000 */
+{ 131463 }, /* DCache_L2_All_Miss\000\000max(l2_rqsts.all_demand_data_rd - l2_rqsts.demand_data_rd_hit, 0) + l2_rqsts.pf_miss + l2_rqsts.rfo_miss\000\000\000\000\000\000\000\000000 */
+{ 131663 }, /* DCache_L2_Hits\000\000d_ratio(DCache_L2_All_Hits, DCache_L2_All)\000\000\000\000\000\000\000\000000 */
+{ 131732 }, /* DCache_L2_Misses\000\000d_ratio(DCache_L2_All_Miss, DCache_L2_All)\000\000\000\000\000\000\000\000000 */
+{ 130996 }, /* Frontend_Bound_SMT\000\000idq_uops_not_delivered.core / (4 * (cpu_clk_unhalted.thread / 2 * (1 + cpu_clk_unhalted.one_thread_active / cpu_clk_unhalted.ref_xclk)))\000\000\000\000\000\000\000\000000 */
+{ 130932 }, /* IPC\000group1\000inst_retired.any / cpu_clk_unhalted.thread\000\000\000\000\000\000\000\000000 */
+{ 131870 }, /* L1D_Cache_Fill_BW\000\00064 * l1d.replacement / 1e9 / duration_time\000\000\000\000\000\000\000\000000 */
+{ 131803 }, /* M1\000\000ipc + M2\000\000\000\000\000\000\000\000000 */
+{ 131826 }, /* M2\000\000ipc + M1\000\000\000\000\000\000\000\000000 */
+{ 131849 }, /* M3\000\0001 / M3\000\000\000\000\000\000\000\000000 */
+{ 131296 }, /* cache_miss_cycles\000group1\000dcache_miss_cpi + icache_miss_cycles\000\000\000\000\000\000\000\000000 */
+{ 131163 }, /* dcache_miss_cpi\000\000l1d\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000 */
+{ 131228 }, /* icache_miss_cycles\000\000l1i\\-loads\\-misses / inst_retired.any\000\000\000\000\000\000\000\000000 */
 
 };
 

From 7f5b8d5e6dde6d5019d03a46c02a6281a4d76a22 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sun, 5 Apr 2026 22:18:16 -0700
Subject: [PATCH 085/131] perf sched: Avoid crash for unexpected perf sched
 stats report

Doing a `perf sched record` then `perf sched stats report` crashes as
the tp_handler isn't set.  Add a dummy tp_handler for it rather than
adding an extra check.

Reported-by: Ian Rogers <irogers@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-sched.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index d083e2bb7703..9fb5447f9014 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -4955,6 +4955,7 @@ int cmd_sched(int argc, const char **argv)
 		.switch_event	    = replay_switch_event,
 		.fork_event	    = replay_fork_event,
 	};
+	struct trace_sched_handler stats_ops  = {};
 	int ret;
 
 	perf_tool__init(&sched.tool, /*ordered_events=*/true);
@@ -5037,6 +5038,7 @@ int cmd_sched(int argc, const char **argv)
 	} else if (!strcmp(argv[0], "stats")) {
 		const char *const stats_subcommands[] = {"record", "report", NULL};
 
+		sched.tp_handler = &stats_ops;
 		argc = parse_options_subcommand(argc, argv, stats_options,
 						stats_subcommands,
 						stats_usage,

From f1d78f5c9bd4dfda5f12372a4b99e413272723d2 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 20 Mar 2026 23:14:48 -0700
Subject: [PATCH 086/131] perf tests sched stats: Write output to temp file

Writing to the perf.data file can fail in various contexts such as
continual testing. Other tests write to a mktemp-ed file; make the "perf
sched stats tests" follow this convention.

Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/perf_sched_stats.sh | 37 ++++++++++++++++------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/tools/perf/tests/shell/perf_sched_stats.sh b/tools/perf/tests/shell/perf_sched_stats.sh
index 2b1410b050d0..bef7714ef37a 100755
--- a/tools/perf/tests/shell/perf_sched_stats.sh
+++ b/tools/perf/tests/shell/perf_sched_stats.sh
@@ -4,10 +4,29 @@
 
 set -e
 
+perfdata=$(mktemp /tmp/__perf_test_sched_stats.perf.data.XXXXX)
+perfdata2=$(mktemp /tmp/__perf_test_sched_stats.perf.data.XXXXX)
+
+cleanup() {
+  rm -f "${perfdata}"
+  rm -f "${perfdata}".old
+  rm -f "${perfdata2}"
+  rm -f "${perfdata2}".old
+
+  trap - EXIT TERM INT
+}
+
+trap_cleanup() {
+  echo "Unexpected signal in ${FUNCNAME[1]}"
+  cleanup
+  exit 1
+}
+trap trap_cleanup EXIT TERM INT
+
 err=0
 test_perf_sched_stats_record() {
   echo "Basic perf sched stats record test"
-  if ! perf sched stats record true 2>&1 | \
+  if ! perf sched stats record -o "${perfdata}" true 2>&1 | \
     grep -E -q "[ perf sched stats: Wrote samples to perf.data ]"
   then
     echo "Basic perf sched stats record test [Failed]"
@@ -19,15 +38,13 @@ test_perf_sched_stats_record() {
 
 test_perf_sched_stats_report() {
   echo "Basic perf sched stats report test"
-  perf sched stats record true > /dev/null
-  if ! perf sched stats report 2>&1 | grep -E -q "Description"
+  perf sched stats record -o "${perfdata}" true > /dev/null
+  if ! perf sched stats report -i "${perfdata}" 2>&1 | grep -E -q "Description"
   then
     echo "Basic perf sched stats report test [Failed]"
     err=1
-    rm perf.data
     return
   fi
-  rm perf.data
   echo "Basic perf sched stats report test [Success]"
 }
 
@@ -44,16 +61,14 @@ test_perf_sched_stats_live() {
 
 test_perf_sched_stats_diff() {
   echo "Basic perf sched stats diff test"
-  perf sched stats record true > /dev/null
-  perf sched stats record true > /dev/null
-  if ! perf sched stats diff > /dev/null
+  perf sched stats record -o "${perfdata}" true > /dev/null
+  perf sched stats record -o "${perfdata2}" true > /dev/null
+  if ! perf sched stats diff "${perfdata}" "${perfdata2}" > /dev/null
   then
     echo "Basic perf sched stats diff test [Failed]"
     err=1
-    rm perf.data.old perf.data
     return
   fi
-  rm perf.data.old perf.data
   echo "Basic perf sched stats diff test [Success]"
 }
 
@@ -61,4 +76,6 @@ test_perf_sched_stats_record
 test_perf_sched_stats_report
 test_perf_sched_stats_live
 test_perf_sched_stats_diff
+
+cleanup
 exit $err

From c66cf8c593c7603415415587077f8de93238544f Mon Sep 17 00:00:00 2001
From: Ricky Ringler <ricky.ringler@proton.me>
Date: Sat, 4 Apr 2026 01:16:56 +0000
Subject: [PATCH 087/131] perf tools: Save cln_size header

Store cacheline size during perf record in header, so that cacheline
size can be used for other features, like sort keys for perf report.

Testing example with feat enabled:

  $ perf record ./Example

  $ perf report --header-only | grep -C 3 cacheline
  CPU_DOMAIN_INFO info available, use -I to display
  e_machine : 62
  e_flags : 0
  cacheline size: 64
  missing features: TRACING_DATA BUILD_ID BRANCH_STACK GROUP_DESC AUXTRACE \
  STAT CLOCKID DIR_FORMAT COMPRESSED CLOCK_DATA
  ========

[namhyung: Update the commit message and remove blank lines]
Signed-off-by: Ricky Ringler <ricky.ringler@proton.me>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-inject.c |  1 +
 tools/perf/util/env.h       |  1 +
 tools/perf/util/header.c    | 30 ++++++++++++++++++++++++++++++
 tools/perf/util/header.h    |  3 +++
 tools/perf/util/sort.c      | 37 ++++++++++++++++++++++++++-----------
 5 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 5b29f4296861..11ac7c8c4be3 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -2134,6 +2134,7 @@ static bool keep_feat(struct perf_inject *inject, int feat)
 	case HEADER_HYBRID_TOPOLOGY:
 	case HEADER_PMU_CAPS:
 	case HEADER_CPU_DOMAIN_INFO:
+	case HEADER_CLN_SIZE:
 		return true;
 	/* Information that can be updated */
 	case HEADER_BUILD_ID:
diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h
index a4501cbca375..c7052ac1f856 100644
--- a/tools/perf/util/env.h
+++ b/tools/perf/util/env.h
@@ -112,6 +112,7 @@ struct perf_env {
 	struct cpu_cache_level	*caches;
 	struct cpu_domain_map	**cpu_domain;
 	int			 caches_cnt;
+	unsigned int		cln_size;
 	u32			comp_ratio;
 	u32			comp_ver;
 	u32			comp_type;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ad7d09a481bb..a3b7b796639b 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -54,6 +54,7 @@
 #include "bpf-event.h"
 #include "bpf-utils.h"
 #include "clockid.h"
+#include "cacheline.h"
 
 #include <linux/ctype.h>
 #include <internal/lib.h>
@@ -1315,6 +1316,19 @@ out:
 	return ret;
 }
 
+static int write_cln_size(struct feat_fd *ff,
+		       struct evlist *evlist __maybe_unused)
+{
+	int cln_size = cacheline_size();
+
+	if (!cln_size)
+		cln_size = DEFAULT_CACHELINE_SIZE;
+
+	ff->ph->env.cln_size = cln_size;
+
+	return do_write(ff, &cln_size, sizeof(cln_size));
+}
+
 static int write_stat(struct feat_fd *ff __maybe_unused,
 		      struct evlist *evlist __maybe_unused)
 {
@@ -2278,6 +2292,11 @@ static void print_cache(struct feat_fd *ff, FILE *fp __maybe_unused)
 	}
 }
 
+static void print_cln_size(struct feat_fd *ff, FILE *fp)
+{
+	fprintf(fp, "# cacheline size: %u\n", ff->ph->env.cln_size);
+}
+
 static void print_compressed(struct feat_fd *ff, FILE *fp)
 {
 	fprintf(fp, "# compressed : %s, level = %d, ratio = %d\n",
@@ -3184,6 +3203,16 @@ out_free_caches:
 	return -1;
 }
 
+static int process_cln_size(struct feat_fd *ff, void *data __maybe_unused)
+{
+	struct perf_env *env = &ff->ph->env;
+
+	if (do_read_u32(ff, &env->cln_size))
+		return -1;
+
+	return 0;
+}
+
 static int process_sample_time(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct perf_session *session;
@@ -3797,6 +3826,7 @@ const struct perf_header_feature_ops feat_ops[HEADER_LAST_FEATURE] = {
 	FEAT_OPR(PMU_CAPS,	pmu_caps,	false),
 	FEAT_OPR(CPU_DOMAIN_INFO,	cpu_domain_info,	true),
 	FEAT_OPR(E_MACHINE,	e_machine,	false),
+	FEAT_OPR(CLN_SIZE,	cln_size,	false),
 };
 
 struct header_print_data {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 41ce663d93ff..86b1a72026d3 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -55,6 +55,7 @@ enum {
 	HEADER_PMU_CAPS,
 	HEADER_CPU_DOMAIN_INFO,
 	HEADER_E_MACHINE,
+	HEADER_CLN_SIZE,
 	HEADER_LAST_FEATURE,
 	HEADER_FEAT_BITS	= 256,
 };
@@ -206,6 +207,8 @@ int write_padded(struct feat_fd *fd, const void *bf,
 
 int build_caches_for_cpu(u32 cpu, struct cpu_cache_level caches[], u32 *cntp);
 
+#define DEFAULT_CACHELINE_SIZE 64
+
 /*
  * arch specific callback
  */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index fda8fcfa46e0..5c9656cc4f9d 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -31,6 +31,7 @@
 #include "time-utils.h"
 #include "cgroup.h"
 #include "machine.h"
+#include "session.h"
 #include "trace-event.h"
 #include <linux/kernel.h>
 #include <linux/string.h>
@@ -2584,7 +2585,26 @@ struct sort_entry sort_type_offset = {
 
 /* --sort typecln */
 
-#define DEFAULT_CACHELINE_SIZE 64
+static int
+hist_entry__cln_size(struct hist_entry *he)
+{
+	int ret = 0;
+
+	if (he && he->hists) {
+		struct evsel *evsel = hists_to_evsel(he->hists);
+
+		if (evsel) {
+			struct perf_session *session = evsel__session(evsel);
+
+			ret = session->header.env.cln_size;
+		}
+	}
+
+	if (ret < 1)
+		ret = DEFAULT_CACHELINE_SIZE; // avoid div/0 later
+
+	return ret;
+}
 
 static int64_t
 sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
@@ -2592,11 +2612,9 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	struct annotated_data_type *left_type = left->mem_type;
 	struct annotated_data_type *right_type = right->mem_type;
 	int64_t left_cln, right_cln;
+	int64_t cln_size_left = hist_entry__cln_size(left);
+	int64_t cln_size_right = hist_entry__cln_size(right);
 	int64_t ret;
-	int cln_size = cacheline_size();
-
-	if (cln_size == 0)
-		cln_size = DEFAULT_CACHELINE_SIZE;
 
 	if (!left_type) {
 		sort__type_init(left);
@@ -2612,8 +2630,8 @@ sort__typecln_sort(struct hist_entry *left, struct hist_entry *right)
 	if (ret)
 		return ret;
 
-	left_cln = left->mem_type_off / cln_size;
-	right_cln = right->mem_type_off / cln_size;
+	left_cln = left->mem_type_off / cln_size_left;
+	right_cln = right->mem_type_off / cln_size_right;
 	return left_cln - right_cln;
 }
 
@@ -2621,10 +2639,7 @@ static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf,
 				     size_t size, unsigned int width __maybe_unused)
 {
 	struct annotated_data_type *he_type = he->mem_type;
-	int cln_size = cacheline_size();
-
-	if (cln_size == 0)
-		cln_size = DEFAULT_CACHELINE_SIZE;
+	int cln_size = hist_entry__cln_size(he);
 
 	return repsep_snprintf(bf, size, "%s: cache-line %d", he_type->self.type_name,
 			       he->mem_type_off / cln_size);

From 8a7a23b27d55e036c2c54438d75878cf24bf95f6 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 3 Apr 2026 20:43:01 -0700
Subject: [PATCH 088/131] perf sample: Document struct perf_sample

Add kernel-doc for struct perf_sample capturing the somewhat unusual
population of fields and lifetime relationships.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/sample.h | 109 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 105 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 3cce8dd202aa..9febad1c8207 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -81,47 +81,148 @@ struct simd_flags {
 #define SIMD_OP_FLAGS_PRED_PARTIAL	0x01	/* partial predicate */
 #define SIMD_OP_FLAGS_PRED_EMPTY	0x02	/* empty predicate */
 
+/**
+ * struct perf_sample
+ *
+ * A sample is generally filled in by evlist__parse_sample/evsel__parse_sample
+ * which fills in the variables from a "union perf_event *event" which is data
+ * from a perf ring buffer or perf.data file. The "event" sample is variable in
+ * length as determined by the perf_event_attr (in the evsel) and details within
+ * the sample event itself. A struct perf_sample avoids needing to care about
+ * the variable length nature of the original event.
+ *
+ * To avoid being excessively large parts of the struct perf_sample are pointers
+ * into the original sample event. In general the lifetime of a struct
+ * perf_sample needs to be less than the "union perf_event *event" it was
+ * derived from.
+ *
+ * The struct regs_dump user_regs and intr_regs are lazily allocated again for
+ * size reasons, due to them holding a cache of looked up registers. The
+ * function pair of perf_sample__init and perf_sample__exit correctly initialize
+ * and clean up these values.
+ */
 struct perf_sample {
+	/** @ip: The sample event PERF_SAMPLE_IP value. */
 	u64 ip;
-	u32 pid, tid;
+	/** @pid: The sample event PERF_SAMPLE_TID pid value. */
+	u32 pid;
+	/** @tid: The sample event PERF_SAMPLE_TID tid value. */
+	u32 tid;
+	/** @time: The sample event PERF_SAMPLE_TIME value. */
 	u64 time;
+	/** @addr: The sample event PERF_SAMPLE_ADDR value. */
 	u64 addr;
+	/** @id: The sample event PERF_SAMPLE_ID or PERF_SAMPLE_IDENTIFIER value. */
 	u64 id;
+	/** @stream_id: The sample event PERF_SAMPLE_STREAM_ID value. */
 	u64 stream_id;
+	/** @period: The sample event PERF_SAMPLE_PERIOD value. */
 	u64 period;
+	/** @weight: Data determined by PERF_SAMPLE_WEIGHT or PERF_SAMPLE_WEIGHT_STRUCT. */
 	u64 weight;
+	/** @transaction: The sample event PERF_SAMPLE_TRANSACTION value. */
 	u64 transaction;
+	/** @insn_cnt: Filled in and used by intel-pt. */
 	u64 insn_cnt;
+	/** @cyc_cnt: Filled in and used by intel-pt. */
 	u64 cyc_cnt;
+	/** @cpu: The sample event PERF_SAMPLE_CPU value. */
 	u32 cpu;
+	/**
+	 * @raw_size: The size in bytes of raw data from PERF_SAMPLE_RAW. For
+	 *            alignment reasons this should always be sizeof(u32)
+	 *            followed by a multiple of sizeof(u64).
+	 */
 	u32 raw_size;
+	/** @data_src: The sample event PERF_SAMPLE_DATA_SRC value. */
 	u64 data_src;
+	/** @phys_addr: The sample event PERF_SAMPLE_PHYS_ADDR value. */
 	u64 phys_addr;
+	/** @data_page_size: The sample event PERF_SAMPLE_DATA_PAGE_SIZE value. */
 	u64 data_page_size;
+	/** @code_page_size: The sample event PERF_SAMPLE_CODE_PAGE_SIZE value. */
 	u64 code_page_size;
+	/** @cgroup: The sample event PERF_SAMPLE_CGROUP value. */
 	u64 cgroup;
+	/** @flags: Extra flag data from auxiliary events like intel-pt. */
 	u32 flags;
+	/** @machine_pid: The guest machine pid derived from the sample id. */
 	u32 machine_pid;
+	/** @vcpu: The guest machine vcpu derived from the sample id. */
 	u32 vcpu;
+	/**
+	 * @insn_len: Instruction length from auxiliary events like
+	 *            intel-pt. The instruction itself is held in insn.
+	 */
 	u16 insn_len;
+	/**
+	 * @cpumode: The cpumode from struct perf_event_header misc variable
+	 *           masked with CPUMODE_MASK. Gives user, kernel and hypervisor
+	 *           information.
+	 */
 	u8  cpumode;
+	/** @misc: The entire struct perf_event_header misc variable. */
 	u16 misc;
+	/**
+	 * @ins_lat: Instruction latency information from weight2 in
+	 *           PERF_SAMPLE_WEIGHT_STRUCT or auxiliary events like
+	 *           intel-pt.
+	 */
 	u16 ins_lat;
-	/** @weight3: On x86 holds retire_lat, on powerpc holds p_stage_cyc. */
+	/**
+	 * @weight3: From PERF_SAMPLE_WEIGHT_STRUCT. On x86 holds retire_lat, on
+	 *           powerpc holds p_stage_cyc.
+	 */
 	u16 weight3;
-	bool no_hw_idx;		/* No hw_idx collected in branch_stack */
-	bool deferred_callchain;	/* Has deferred user callchains */
+	/**
+	 * @no_hw_idx: For PERF_SAMPLE_BRANCH_STACK, true when
+	 *             PERF_SAMPLE_BRANCH_HW_INDEX isn't set.
+	 */
+	bool no_hw_idx;
+	/**
+	 * @deferred_callchain: When processing PERF_SAMPLE_CALLCHAIN a deferred
+	 *                      user callchain marker was encountered.
+	 */
+	bool deferred_callchain;
+	/**
+	 * @deferred_cookie: Identifier of the deferred callchain in the later
+	 *                   PERF_RECORD_CALLCHAIN_DEFERRED event.
+	 */
 	u64 deferred_cookie;
+	/** @insn: A copy of the sampled instruction filled in by perf_sample__fetch_insn. */
 	char insn[MAX_INSN];
+	/** @raw_data: Pointer into the original event for PERF_SAMPLE_RAW data. */
 	void *raw_data;
+	/**
+	 * @callchain: Pointer into the original event for PERF_SAMPLE_CALLCHAIN
+	 *             data. For deferred callchains this may be a copy that
+	 *             needs freeing, see sample__merge_deferred_callchain.
+	 */
 	struct ip_callchain *callchain;
+	/** @branch_stack: Pointer into the original event for PERF_SAMPLE_BRANCH_STACK data. */
 	struct branch_stack *branch_stack;
+	/**
+	 * @branch_stack_cntr: Pointer into the original event for
+	 *                     PERF_SAMPLE_BRANCH_COUNTERS data.
+	 */
 	u64 *branch_stack_cntr;
+	/** @user_regs: Values and pointers into the sample for PERF_SAMPLE_REGS_USER. */
 	struct regs_dump  *user_regs;
+	/** @intr_regs: Values and pointers into the sample for PERF_SAMPLE_REGS_INTR. */
 	struct regs_dump  *intr_regs;
+	/** @user_stack: Size and pointer into the sample for PERF_SAMPLE_STACK_USER. */
 	struct stack_dump user_stack;
+	/**
+	 * @read: The sample event PERF_SAMPLE_READ counter values. The valid
+	 *        values depend on the attr.read_format PERF_FORMAT_ values.
+	 */
 	struct sample_read read;
+	/**
+	 * @aux_sample: Similar to raw data but with a 64-bit size and
+	 *              alignment, PERF_SAMPLE_AUX data.
+	 */
 	struct aux_sample aux_sample;
+	/** @simd_flags: SIMD flag information from ARM SPE auxiliary events. */
 	struct simd_flags simd_flags;
 };
 

From ad5ceacd48e9ea36bd12e778071561290adb0154 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 3 Apr 2026 20:43:02 -0700
Subject: [PATCH 089/131] perf sample: Make sure perf_sample__init/exit are
 used

The deferred stack trace code wasn't using perf_sample__init/exit. Add
the deferred stack trace clean up to perf_sample__exit which requires
proper NULL initialization in perf_sample__init. Make the
perf_sample__exit robust to being called more than once by using
zfree. Make the error paths in evsel__parse_sample exit the
sample. Add a merged_callchain boolean to capture that callchain is
allocated, deferred_callchain doesn't suffice for this. Pack the struct
variables to avoid padding bytes for this.

Similarly powerpc_vpadtl_sample wasn't using perf_sample__init/exit,
use it for consistency and to avoid potential issues with uninitialized
variables.

Similarly guest_session__inject_events in builtin-inject wasn't using
perf_sample__init/exit. The lifetime management for fetched events is
somewhat complex there, but when an event is fetched the sample should
be initialized and needs exiting on error. The sample may be left in
place so that future injects have access to it.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-inject.c        | 55 +++++++++++++++++++++---------
 tools/perf/tests/perf-record.c     |  1 +
 tools/perf/tests/switch-tracking.c |  2 ++
 tools/perf/util/callchain.c        | 10 ++++--
 tools/perf/util/evlist.c           |  5 ++-
 tools/perf/util/evsel.c            | 34 +++++++++++-------
 tools/perf/util/powerpc-vpadtl.c   | 10 +++---
 tools/perf/util/sample.c           | 10 ++++--
 tools/perf/util/sample.h           | 17 +++++----
 tools/perf/util/session.c          | 13 ++++---
 10 files changed, 108 insertions(+), 49 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 11ac7c8c4be3..952e6f6f3168 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -1087,6 +1087,7 @@ static int perf_inject__sched_stat(const struct perf_tool *tool,
 	struct perf_sample sample_sw;
 	struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
 	u32 pid = evsel__intval(evsel, sample, "pid");
+	int ret;
 
 	list_for_each_entry(ent, &inject->samples, node) {
 		if (pid == ent->tid)
@@ -1103,7 +1104,9 @@ found:
 	perf_event__synthesize_sample(event_sw, evsel->core.attr.sample_type,
 				      evsel->core.attr.read_format, &sample_sw);
 	build_id__mark_dso_hit(tool, event_sw, &sample_sw, evsel, machine);
-	return perf_event__repipe(tool, event_sw, &sample_sw, machine);
+	ret = perf_event__repipe(tool, event_sw, &sample_sw, machine);
+	perf_sample__exit(&sample_sw);
+	return ret;
 }
 #endif
 
@@ -1648,6 +1651,7 @@ static int guest_session__fetch(struct guest_session *gs)
 	size_t hdr_sz = sizeof(*hdr);
 	ssize_t ret;
 
+	perf_sample__init(&gs->ev.sample, /*all=*/false);
 	buf = gs->ev.event_buf;
 	if (!buf) {
 		buf = malloc(PERF_SAMPLE_MAX_SIZE);
@@ -1745,18 +1749,24 @@ static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
 		if (!gs->fetched) {
 			ret = guest_session__fetch(gs);
 			if (ret)
-				return ret;
+				break;
 			gs->fetched = true;
 		}
 
 		ev = gs->ev.event;
 		sample = &gs->ev.sample;
 
-		if (!ev->header.size)
-			return 0; /* EOF */
-
-		if (sample->time > timestamp)
-			return 0;
+		if (!ev->header.size) {
+			/* EOF */
+			perf_sample__exit(&gs->ev.sample);
+			gs->fetched = false;
+			ret = 0;
+			break;
+		}
+		if (sample->time > timestamp) {
+			ret = 0;
+			break;
+		}
 
 		/* Change cpumode to guest */
 		cpumode = ev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
@@ -1779,12 +1789,14 @@ static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
 
 		if (id_hdr_size & 7) {
 			pr_err("Bad id_hdr_size %u\n", id_hdr_size);
-			return -EINVAL;
+			ret = -EINVAL;
+			break;
 		}
 
 		if (ev->header.size & 7) {
 			pr_err("Bad event size %u\n", ev->header.size);
-			return -EINVAL;
+			ret = -EINVAL;
+			break;
 		}
 
 		/* Remove guest id sample */
@@ -1792,14 +1804,16 @@ static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
 
 		if (ev->header.size & 7) {
 			pr_err("Bad raw event size %u\n", ev->header.size);
-			return -EINVAL;
+			ret = -EINVAL;
+			break;
 		}
 
 		guest_id = guest_session__lookup_id(gs, id);
 		if (!guest_id) {
 			pr_err("Guest event with unknown id %llu\n",
 			       (unsigned long long)id);
-			return -EINVAL;
+			ret = -EINVAL;
+			break;
 		}
 
 		/* Change to host ID to avoid conflicting ID values */
@@ -1819,19 +1833,28 @@ static int guest_session__inject_events(struct guest_session *gs, u64 timestamp)
 		/* New id sample with new ID and CPU */
 		ret = evlist__append_id_sample(inject->session->evlist, ev, sample);
 		if (ret)
-			return ret;
+			break;
 
 		if (ev->header.size & 7) {
 			pr_err("Bad new event size %u\n", ev->header.size);
-			return -EINVAL;
+			ret = -EINVAL;
+			break;
 		}
 
-		gs->fetched = false;
-
 		ret = output_bytes(inject, ev, ev->header.size);
 		if (ret)
-			return ret;
+			break;
+
+		/* Reset for next guest session event fetch. */
+		perf_sample__exit(sample);
+		gs->fetched = false;
 	}
+	if (ret && gs->fetched) {
+		/* Clear saved sample state on error. */
+		perf_sample__exit(&gs->ev.sample);
+		gs->fetched = false;
+	}
+	return ret;
 }
 
 static int guest_session__flush_events(struct guest_session *gs)
diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index c6e31ab8a6b8..ad44cc68820b 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -300,6 +300,7 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest
 				}
 
 				perf_mmap__consume(&md->core);
+				perf_sample__exit(&sample);
 			}
 			perf_mmap__read_done(&md->core);
 		}
diff --git a/tools/perf/tests/switch-tracking.c b/tools/perf/tests/switch-tracking.c
index 15791fcb76b2..72a8289e846d 100644
--- a/tools/perf/tests/switch-tracking.c
+++ b/tools/perf/tests/switch-tracking.c
@@ -239,11 +239,13 @@ static int add_event(struct evlist *evlist, struct list_head *events,
 
 	if (!sample.time) {
 		pr_debug("event with no time\n");
+		perf_sample__exit(&sample);
 		return -1;
 	}
 
 	node->event_time = sample.time;
 
+	perf_sample__exit(&sample);
 	return 0;
 }
 
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index f879b84f8ff9..f031cbbeeba8 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1901,16 +1901,19 @@ int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
 	u64 nr_deferred = sample_callchain->callchain->nr;
 	struct ip_callchain *callchain;
 
+	if (sample_orig->merged_callchain) {
+		/* Already merged. */
+		return -EINVAL;
+	}
+
 	if (sample_orig->callchain->nr < 2) {
 		sample_orig->deferred_callchain = false;
 		return -EINVAL;
 	}
 
 	callchain = calloc(1 + nr_orig + nr_deferred, sizeof(u64));
-	if (callchain == NULL) {
-		sample_orig->deferred_callchain = false;
+	if (callchain == NULL)
 		return -ENOMEM;
-	}
 
 	callchain->nr = nr_orig + nr_deferred;
 	/* copy original including PERF_CONTEXT_USER_DEFERRED (but the cookie) */
@@ -1919,6 +1922,7 @@ int sample__merge_deferred_callchain(struct perf_sample *sample_orig,
 	memcpy(&callchain->ips[nr_orig], sample_callchain->callchain->ips,
 	       nr_deferred * sizeof(u64));
 
+	sample_orig->merged_callchain = true;
 	sample_orig->callchain = callchain;
 	return 0;
 }
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index c702741a9173..f46e1d40bad7 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1632,8 +1632,11 @@ int evlist__parse_sample(struct evlist *evlist, union perf_event *event, struct
 	struct evsel *evsel = evlist__event2evsel(evlist, event);
 	int ret;
 
-	if (!evsel)
+	if (!evsel) {
+		/* Ensure the sample is okay for perf_sample__exit. */
+		perf_sample__init(sample, /*all=*/false);
 		return -EFAULT;
+	}
 	ret = evsel__parse_sample(evsel, event, sample);
 	if (ret)
 		return ret;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 1281af056cec..df2392713edb 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3073,7 +3073,7 @@ static inline bool overflow(const void *endp, u16 max_size, const void *offset,
 #define OVERFLOW_CHECK(offset, size, max_size)				\
 	do {								\
 		if (overflow(endp, (max_size), (offset), (size)))	\
-			return -EFAULT;					\
+			goto out_efault;				\
 	} while (0)
 
 #define OVERFLOW_CHECK_u64(offset) \
@@ -3205,6 +3205,8 @@ static int __set_offcpu_sample(struct perf_sample *data)
 	data->cgroup = *array;
 
 	return 0;
+out_efault:
+	return -EFAULT;
 }
 
 int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
@@ -3223,7 +3225,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 	 */
 	union u64_swap u;
 
-	memset(data, 0, sizeof(*data));
+	perf_sample__init(data, /*all=*/true);
 	data->cpu = data->pid = data->tid = -1;
 	data->stream_id = data->id = data->time = -1ULL;
 	data->period = evsel->core.attr.sample_period;
@@ -3237,25 +3239,26 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 
 		data->callchain = (struct ip_callchain *)&event->callchain_deferred.nr;
 		if (data->callchain->nr > max_callchain_nr)
-			return -EFAULT;
+			goto out_efault;
 
 		data->deferred_cookie = event->callchain_deferred.cookie;
 
 		if (evsel->core.attr.sample_id_all)
 			perf_evsel__parse_id_sample(evsel, event, data);
+
 		return 0;
 	}
 
 	if (event->header.type != PERF_RECORD_SAMPLE) {
-		if (!evsel->core.attr.sample_id_all)
-			return 0;
-		return perf_evsel__parse_id_sample(evsel, event, data);
+		if (evsel->core.attr.sample_id_all)
+			perf_evsel__parse_id_sample(evsel, event, data);
+		return 0;
 	}
 
 	array = event->sample.array;
 
 	if (perf_event__check_size(event, evsel->sample_size))
-		return -EFAULT;
+		goto out_efault;
 
 	if (type & PERF_SAMPLE_IDENTIFIER) {
 		data->id = *array;
@@ -3348,7 +3351,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 					sizeof(struct sample_read_value);
 
 			if (data->read.group.nr > max_group_nr)
-				return -EFAULT;
+				goto out_efault;
 
 			sz = data->read.group.nr * sample_read_value_size(read_format);
 			OVERFLOW_CHECK(array, sz, max_size);
@@ -3376,7 +3379,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		data->callchain = (struct ip_callchain *)array++;
 		callchain_nr = data->callchain->nr;
 		if (callchain_nr > max_callchain_nr)
-			return -EFAULT;
+			goto out_efault;
 		sz = callchain_nr * sizeof(u64);
 		/*
 		 * Save the cookie for the deferred user callchain.  The last 2
@@ -3434,7 +3437,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		data->branch_stack = (struct branch_stack *)array++;
 
 		if (data->branch_stack->nr > max_branch_nr)
-			return -EFAULT;
+			goto out_efault;
 
 		sz = data->branch_stack->nr * sizeof(struct branch_entry);
 		if (evsel__has_branch_hw_idx(evsel)) {
@@ -3511,7 +3514,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 			data->user_stack.size = *array++;
 			if (WARN_ONCE(data->user_stack.size > sz,
 				      "user stack dump failure\n"))
-				return -EFAULT;
+				goto out_efault;
 		}
 	}
 
@@ -3588,10 +3591,15 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 		array = (void *)array + sz;
 	}
 
-	if (evsel__is_offcpu_event(evsel))
-		return __set_offcpu_sample(data);
+	if (evsel__is_offcpu_event(evsel)) {
+		if (__set_offcpu_sample(data))
+			goto out_efault;
+	}
 
 	return 0;
+out_efault:
+	perf_sample__exit(data);
+	return -EFAULT;
 }
 
 int evsel__parse_sample_timestamp(struct evsel *evsel, union perf_event *event,
diff --git a/tools/perf/util/powerpc-vpadtl.c b/tools/perf/util/powerpc-vpadtl.c
index d1c3396f182f..993ab16614c7 100644
--- a/tools/perf/util/powerpc-vpadtl.c
+++ b/tools/perf/util/powerpc-vpadtl.c
@@ -182,7 +182,9 @@ static int powerpc_vpadtl_sample(struct powerpc_vpadtl_entry *record,
 {
 	struct perf_sample sample;
 	union perf_event event;
+	int ret;
 
+	perf_sample__init(&sample, /*all=*/true);
 	sample.ip = be64_to_cpu(record->srr0);
 	sample.period = 1;
 	sample.cpu = cpu;
@@ -198,12 +200,12 @@ static int powerpc_vpadtl_sample(struct powerpc_vpadtl_entry *record,
 	event.sample.header.misc = sample.cpumode;
 	event.sample.header.size = sizeof(struct perf_event_header);
 
-	if (perf_session__deliver_synth_event(vpa->session, &event, &sample)) {
+	ret = perf_session__deliver_synth_event(vpa->session, &event, &sample);
+	if (ret)
 		pr_debug("Failed to create sample for dtl entry\n");
-		return -1;
-	}
 
-	return 0;
+	perf_sample__exit(&sample);
+	return ret;
 }
 
 static int powerpc_vpadtl_get_buffer(struct powerpc_vpadtl_queue *vpaq)
diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c
index 8f82aaf1aab6..2a30de4573f6 100644
--- a/tools/perf/util/sample.c
+++ b/tools/perf/util/sample.c
@@ -21,13 +21,19 @@ void perf_sample__init(struct perf_sample *sample, bool all)
 	} else {
 		sample->user_regs = NULL;
 		sample->intr_regs = NULL;
+		sample->merged_callchain = false;
+		sample->callchain = NULL;
 	}
 }
 
 void perf_sample__exit(struct perf_sample *sample)
 {
-	free(sample->user_regs);
-	free(sample->intr_regs);
+	zfree(&sample->user_regs);
+	zfree(&sample->intr_regs);
+	if (sample->merged_callchain) {
+		zfree(&sample->callchain);
+		sample->merged_callchain = false;
+	}
 }
 
 struct regs_dump *perf_sample__user_regs(struct perf_sample *sample)
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 9febad1c8207..fea7c77ff802 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -155,12 +155,6 @@ struct perf_sample {
 	 *            intel-pt. The instruction itself is held in insn.
 	 */
 	u16 insn_len;
-	/**
-	 * @cpumode: The cpumode from struct perf_event_header misc variable
-	 *           masked with CPUMODE_MASK. Gives user, kernel and hypervisor
-	 *           information.
-	 */
-	u8  cpumode;
 	/** @misc: The entire struct perf_event_header misc variable. */
 	u16 misc;
 	/**
@@ -174,6 +168,12 @@ struct perf_sample {
 	 *           powerpc holds p_stage_cyc.
 	 */
 	u16 weight3;
+	/**
+	 * @cpumode: The cpumode from struct perf_event_header misc variable
+	 *           masked with CPUMODE_MASK. Gives user, kernel and hypervisor
+	 *           information.
+	 */
+	u8  cpumode;
 	/**
 	 * @no_hw_idx: For PERF_SAMPLE_BRANCH_STACK, true when
 	 *             PERF_SAMPLE_BRANCH_HW_INDEX isn't set.
@@ -184,6 +184,11 @@ struct perf_sample {
 	 *                      user callchain marker was encountered.
 	 */
 	bool deferred_callchain;
+	/**
+	 * @merged_callchain: A synthesized merged callchain that is allocated
+	 *                    and needs freeing.
+	 */
+	bool merged_callchain;
 	/**
 	 * @deferred_cookie: Identifier of the deferred callchain in the later
 	 *                   PERF_RECORD_CALLCHAIN_DEFERRED event.
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 3a911c70cd0e..53fb2e628b71 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1374,14 +1374,18 @@ static int evlist__deliver_deferred_callchain(struct evlist *evlist,
 	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
 		struct perf_sample orig_sample;
 
+		perf_sample__init(&orig_sample, /*all=*/false);
 		ret = evlist__parse_sample(evlist, de->event, &orig_sample);
 		if (ret < 0) {
 			pr_err("failed to parse original sample\n");
+			perf_sample__exit(&orig_sample);
 			break;
 		}
 
-		if (sample->tid != orig_sample.tid)
+		if (sample->tid != orig_sample.tid) {
+			perf_sample__exit(&orig_sample);
 			continue;
+		}
 
 		if (event->callchain_deferred.cookie == orig_sample.deferred_cookie)
 			sample__merge_deferred_callchain(&orig_sample, sample);
@@ -1392,9 +1396,7 @@ static int evlist__deliver_deferred_callchain(struct evlist *evlist,
 		ret = evlist__deliver_sample(evlist, tool, de->event,
 					     &orig_sample, evsel, machine);
 
-		if (orig_sample.deferred_callchain)
-			free(orig_sample.callchain);
-
+		perf_sample__exit(&orig_sample);
 		list_del(&de->list);
 		free(de->event);
 		free(de);
@@ -1421,9 +1423,11 @@ static int session__flush_deferred_samples(struct perf_session *session,
 	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
 		struct perf_sample sample;
 
+		perf_sample__init(&sample, /*all=*/false);
 		ret = evlist__parse_sample(evlist, de->event, &sample);
 		if (ret < 0) {
 			pr_err("failed to parse original sample\n");
+			perf_sample__exit(&sample);
 			break;
 		}
 
@@ -1431,6 +1435,7 @@ static int session__flush_deferred_samples(struct perf_session *session,
 		ret = evlist__deliver_sample(evlist, tool, de->event,
 					     &sample, evsel, machine);
 
+		perf_sample__exit(&sample);
 		list_del(&de->list);
 		free(de->event);
 		free(de);

From aeae075a0352eb6ab363fb1910f209eaa296a175 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 3 Apr 2026 20:43:03 -0700
Subject: [PATCH 090/131] perf sample: Add evsel to struct perf_sample

Add the evsel from evsel__parse_sample into the struct
perf_sample. Sometimes we want to alter the evsel associated with a
sample, such as with off-cpu bpf-output events. In general the evsel
and perf_sample are passed as a pair, but this makes an altered evsel
something of a chore to keep checking for and setting up. Later
patches will remove passing an evsel with the perf_sample and switch
to just using the perf_sample's value.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-inject.c       |  6 +++---
 tools/perf/builtin-script.c       |  4 ++++
 tools/perf/tests/hists_cumulate.c |  2 +-
 tools/perf/tests/hists_filter.c   |  1 +
 tools/perf/tests/hists_output.c   |  2 +-
 tools/perf/util/evsel.c           |  1 +
 tools/perf/util/sample.c          |  1 +
 tools/perf/util/sample.h          |  3 +++
 tools/perf/util/session.c         | 35 +++++++++++++++++++------------
 9 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 952e6f6f3168..b4add7a70b22 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -133,7 +133,7 @@ struct perf_inject {
 	struct perf_file_section secs[HEADER_FEAT_BITS];
 	struct guest_session	guest_session;
 	struct strlist		*known_build_ids;
-	const struct evsel	*mmap_evsel;
+	struct evsel		*mmap_evsel;
 	struct ip_callchain	*raw_callchain;
 };
 
@@ -519,7 +519,7 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename,
  * processing mmap events. If not stashed, search the evlist for the first mmap
  * gathering event.
  */
-static const struct evsel *inject__mmap_evsel(struct perf_inject *inject)
+static struct evsel *inject__mmap_evsel(struct perf_inject *inject)
 {
 	struct evsel *pos;
 
@@ -1023,7 +1023,6 @@ int perf_event__inject_buildid(const struct perf_tool *tool, union perf_event *e
 
 	sample__for_each_callchain_node(thread, evsel, sample, PERF_MAX_STACK_DEPTH,
 					/*symbols=*/false, mark_dso_hit_callback, &args);
-
 	thread__put(thread);
 repipe:
 	perf_event__repipe(tool, event, sample, machine);
@@ -1432,6 +1431,7 @@ static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_
 {
 	struct machine *machine = perf_session__findnew_machine(inject->session, machine_pid);
 	struct perf_sample synth_sample = {
+		.evsel	   = inject__mmap_evsel(inject),
 		.pid	   = -1,
 		.tid	   = -1,
 		.time	   = -1,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 622130d3aed4..42d4cc162039 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2910,8 +2910,12 @@ static int print_event_with_time(const struct perf_tool *tool,
 		thread = machine__findnew_thread(machine, pid, tid);
 
 	if (evsel) {
+		struct evsel *saved_evsel = sample->evsel;
+
+		sample->evsel = evsel;
 		perf_sample__fprintf_start(script, sample, thread, evsel,
 					   event->header.type, stdout);
+		sample->evsel = saved_evsel;
 	}
 
 	perf_event__fprintf(event, machine, stdout);
diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c
index 3eb9ef8d7ec6..606aa926a8fc 100644
--- a/tools/perf/tests/hists_cumulate.c
+++ b/tools/perf/tests/hists_cumulate.c
@@ -81,7 +81,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 {
 	struct addr_location al;
 	struct evsel *evsel = hists_to_evsel(hists);
-	struct perf_sample sample = { .period = 1000, };
+	struct perf_sample sample = { .evsel = evsel, .period = 1000, };
 	size_t i;
 
 	addr_location__init(&al);
diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c
index 1cebd20cc91c..cc6b26e373d1 100644
--- a/tools/perf/tests/hists_filter.c
+++ b/tools/perf/tests/hists_filter.c
@@ -70,6 +70,7 @@ static int add_hist_entries(struct evlist *evlist,
 			};
 			struct hists *hists = evsel__hists(evsel);
 
+			sample.evsel = evsel;
 			/* make sure it has no filter at first */
 			hists->thread_filter = NULL;
 			hists->dso_filter = NULL;
diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c
index ee5ec8bda60e..7818950d786e 100644
--- a/tools/perf/tests/hists_output.c
+++ b/tools/perf/tests/hists_output.c
@@ -51,7 +51,7 @@ static int add_hist_entries(struct hists *hists, struct machine *machine)
 {
 	struct addr_location al;
 	struct evsel *evsel = hists_to_evsel(hists);
-	struct perf_sample sample = { .period = 100, };
+	struct perf_sample sample = { .evsel = evsel, .period = 100, };
 	size_t i;
 
 	addr_location__init(&al);
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index df2392713edb..2ee87fd84d3e 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -3226,6 +3226,7 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event,
 	union u64_swap u;
 
 	perf_sample__init(data, /*all=*/true);
+	data->evsel = evsel;
 	data->cpu = data->pid = data->tid = -1;
 	data->stream_id = data->id = data->time = -1ULL;
 	data->period = evsel->core.attr.sample_period;
diff --git a/tools/perf/util/sample.c b/tools/perf/util/sample.c
index 2a30de4573f6..cf73329326d7 100644
--- a/tools/perf/util/sample.c
+++ b/tools/perf/util/sample.c
@@ -19,6 +19,7 @@ void perf_sample__init(struct perf_sample *sample, bool all)
 	if (all) {
 		memset(sample, 0, sizeof(*sample));
 	} else {
+		sample->evsel = NULL;
 		sample->user_regs = NULL;
 		sample->intr_regs = NULL;
 		sample->merged_callchain = false;
diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index fea7c77ff802..3d27a0daef8f 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -5,6 +5,7 @@
 #include <linux/perf_event.h>
 #include <linux/types.h>
 
+struct evsel;
 struct machine;
 struct thread;
 
@@ -102,6 +103,8 @@ struct simd_flags {
  * and clean up these values.
  */
 struct perf_sample {
+	/** @evsel: Backward reference to the evsel used when constructing the sample. */
+	struct evsel *evsel;
 	/** @ip: The sample event PERF_SAMPLE_IP value. */
 	u64 ip;
 	/** @pid: The sample event PERF_SAMPLE_TID pid value. */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 53fb2e628b71..7588cca110d2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1264,8 +1264,9 @@ static int deliver_sample_value(struct evlist *evlist,
 				bool per_thread)
 {
 	struct perf_sample_id *sid = evlist__id2sid(evlist, v->id);
-	struct evsel *evsel;
+	struct evsel *saved_evsel = sample->evsel;
 	u64 *storage = NULL;
+	int ret;
 
 	if (sid) {
 		storage = perf_sample_id__get_period_storage(sid, sample->tid, per_thread);
@@ -1289,8 +1290,10 @@ static int deliver_sample_value(struct evlist *evlist,
 	if (!sample->period)
 		return 0;
 
-	evsel = container_of(sid->evsel, struct evsel, core);
-	return tool->sample(tool, event, sample, evsel, machine);
+	sample->evsel = container_of(sid->evsel, struct evsel, core);
+	ret = tool->sample(tool, event, sample, sample->evsel, machine);
+	sample->evsel = saved_evsel;
+	return ret;
 }
 
 static int deliver_sample_group(struct evlist *evlist,
@@ -1362,13 +1365,16 @@ static int evlist__deliver_deferred_callchain(struct evlist *evlist,
 					      struct machine *machine)
 {
 	struct deferred_event *de, *tmp;
-	struct evsel *evsel;
 	int ret = 0;
 
 	if (!tool->merge_deferred_callchains) {
-		evsel = evlist__id2evsel(evlist, sample->id);
-		return tool->callchain_deferred(tool, event, sample,
-						evsel, machine);
+		struct evsel *saved_evsel = sample->evsel;
+
+		sample->evsel = evlist__id2evsel(evlist, sample->id);
+		ret = tool->callchain_deferred(tool, event, sample,
+					       sample->evsel, machine);
+		sample->evsel = saved_evsel;
+		return ret;
 	}
 
 	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
@@ -1392,9 +1398,9 @@ static int evlist__deliver_deferred_callchain(struct evlist *evlist,
 		else
 			orig_sample.deferred_callchain = false;
 
-		evsel = evlist__id2evsel(evlist, orig_sample.id);
+		orig_sample.evsel = evlist__id2evsel(evlist, orig_sample.id);
 		ret = evlist__deliver_sample(evlist, tool, de->event,
-					     &orig_sample, evsel, machine);
+					     &orig_sample, orig_sample.evsel, machine);
 
 		perf_sample__exit(&orig_sample);
 		list_del(&de->list);
@@ -1417,7 +1423,6 @@ static int session__flush_deferred_samples(struct perf_session *session,
 	struct evlist *evlist = session->evlist;
 	struct machine *machine = &session->machines.host;
 	struct deferred_event *de, *tmp;
-	struct evsel *evsel;
 	int ret = 0;
 
 	list_for_each_entry_safe(de, tmp, &evlist->deferred_samples, list) {
@@ -1431,9 +1436,9 @@ static int session__flush_deferred_samples(struct perf_session *session,
 			break;
 		}
 
-		evsel = evlist__id2evsel(evlist, sample.id);
+		sample.evsel = evlist__id2evsel(evlist, sample.id);
 		ret = evlist__deliver_sample(evlist, tool, de->event,
-					     &sample, evsel, machine);
+					     &sample, sample.evsel, machine);
 
 		perf_sample__exit(&sample);
 		list_del(&de->list);
@@ -1458,8 +1463,12 @@ static int machines__deliver_event(struct machines *machines,
 
 	dump_event(evlist, event, file_offset, sample, file_path);
 
-	evsel = evlist__id2evsel(evlist, sample->id);
+	if (!sample->evsel)
+		sample->evsel = evlist__id2evsel(evlist, sample->id);
+	else
+		assert(sample->evsel == evlist__id2evsel(evlist, sample->id));
 
+	evsel = sample->evsel;
 	machine = machines__find_for_cpumode(machines, event, sample);
 
 	switch (event->header.type) {

From c9ef786c0970991578397043f1c819229e2b7197 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Fri, 3 Apr 2026 23:05:52 -0700
Subject: [PATCH 091/131] perf cgroup: Update metric leader in
 evlist__expand_cgroup

When the evlist is expanded the metric leader wasn't being updated. As
the original evsel is deleted this creates a use-after-free in
stat-shadow's prepare_metric. This was detected running the "perf stat
--bpf-counters --for-each-cgroup test" with sanitizers.

The change itself puts the copied evsel into the priv field (known
unused because of evsel__clone use) and then in a second pass over the
list updates the copied values using the priv pointer.

Fixes: d1c5a0e86a4e ("perf stat: Add --for-each-cgroup option")
Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Sun Jian <sun.jian.kdev@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/cgroup.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index 040eb75f0804..1b5664d1481f 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -417,7 +417,6 @@ static bool has_pattern_string(const char *str)
 int evlist__expand_cgroup(struct evlist *evlist, const char *str, bool open_cgroup)
 {
 	struct evlist *orig_list, *tmp_list;
-	struct evsel *pos, *evsel, *leader;
 	struct rblist orig_metric_events;
 	struct cgroup *cgrp = NULL;
 	struct cgroup_name *cn;
@@ -452,6 +451,7 @@ int evlist__expand_cgroup(struct evlist *evlist, const char *str, bool open_cgro
 		goto out_err;
 
 	list_for_each_entry(cn, &cgroup_list, list) {
+		struct evsel *pos;
 		char *name;
 
 		if (!cn->used)
@@ -467,21 +467,37 @@ int evlist__expand_cgroup(struct evlist *evlist, const char *str, bool open_cgro
 		if (cgrp == NULL)
 			continue;
 
-		leader = NULL;
+		/* copy the list and set to the new cgroup. */
 		evlist__for_each_entry(orig_list, pos) {
-			evsel = evsel__clone(/*dest=*/NULL, pos);
+			struct evsel *evsel = evsel__clone(/*dest=*/NULL, pos);
+
 			if (evsel == NULL)
 				goto out_err;
 
+			/* stash the copy during the copying. */
+			pos->priv = evsel;
 			cgroup__put(evsel->cgrp);
 			evsel->cgrp = cgroup__get(cgrp);
 
-			if (evsel__is_group_leader(pos))
-				leader = evsel;
-			evsel__set_leader(evsel, leader);
-
 			evlist__add(tmp_list, evsel);
 		}
+		/* update leader information using stashed pointer to copy. */
+		evlist__for_each_entry(orig_list, pos) {
+			struct evsel *evsel = pos->priv;
+
+			if (evsel__leader(pos))
+				evsel__set_leader(evsel, evsel__leader(pos)->priv);
+
+			if (pos->metric_leader)
+				evsel->metric_leader = pos->metric_leader->priv;
+
+			if (pos->first_wildcard_match)
+				evsel->first_wildcard_match = pos->first_wildcard_match->priv;
+		}
+		/* the stashed copy is no longer used. */
+		evlist__for_each_entry(orig_list, pos)
+			pos->priv = NULL;
+
 		/* cgroup__new() has a refcount, release it here */
 		cgroup__put(cgrp);
 		nr_cgroups++;

From dc647eb00969cd213c84d6caee90c480317e857d Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Sat, 4 Apr 2026 22:16:44 -0700
Subject: [PATCH 092/131] perf test: Skip sched stats test for !root

Running perf sched stats requires root and it fails to open the
schedstat file for regular users.  Let's skip the test.

  $ perf sched stats true
  Failed to open /proc/sys/kernel/sched_schedstats

Reviewed-by: Ian Rogers <irogers@google.com>
Tested-by: Swapnil Sapkal <swapnil.sapkal@amd.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/perf_sched_stats.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/perf/tests/shell/perf_sched_stats.sh b/tools/perf/tests/shell/perf_sched_stats.sh
index bef7714ef37a..f13eb0a75b76 100755
--- a/tools/perf/tests/shell/perf_sched_stats.sh
+++ b/tools/perf/tests/shell/perf_sched_stats.sh
@@ -4,6 +4,11 @@
 
 set -e
 
+if [ "$(id -u)" != 0 ]; then
+  echo "[Skip] No root permission"
+  exit 2
+fi
+
 perfdata=$(mktemp /tmp/__perf_test_sched_stats.perf.data.XXXXX)
 perfdata2=$(mktemp /tmp/__perf_test_sched_stats.perf.data.XXXXX)
 

From bb7aeeaa2106c6cc31cc88a513249bb80018535d Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Tue, 7 Apr 2026 12:08:33 +0200
Subject: [PATCH 093/131] perf config: Rename
 symbol_conf::disable_add2line_warn

Rename member symbol_conf::disable_add2line_warn to
symbol_conf::addr2line_disable_warn to make it consistent with other
addr2line_xxx constants.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-diff.c     |  4 ++--
 tools/perf/util/addr2line.c   | 12 ++++++------
 tools/perf/util/block-info.c  |  2 +-
 tools/perf/util/libbfd.c      |  2 +-
 tools/perf/util/symbol_conf.h |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 69069926dd0b..35d599d5c9fa 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1352,7 +1352,7 @@ static int cycles_printf(struct hist_entry *he, struct hist_entry *pair,
 	/*
 	 * Avoid printing the warning "addr2line_init failed for ..."
 	 */
-	symbol_conf.disable_add2line_warn = true;
+	symbol_conf.addr2line_disable_warn = true;
 
 	bi = block_he->block_info;
 
@@ -1986,7 +1986,7 @@ int cmd_diff(int argc, const char **argv)
 
 	if (compute == COMPUTE_STREAM) {
 		symbol_conf.show_branchflag_count = true;
-		symbol_conf.disable_add2line_warn = true;
+		symbol_conf.addr2line_disable_warn = true;
 		callchain_param.mode = CHAIN_FLAT;
 		callchain_param.key = CCKEY_SRCLINE;
 		callchain_param.branch_callstack = 1;
diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c
index 31c0391fffa3..e9f084db0802 100644
--- a/tools/perf/util/addr2line.c
+++ b/tools/perf/util/addr2line.c
@@ -123,7 +123,7 @@ static enum cmd_a2l_style cmd_addr2line_configure(struct child_process *a2l, con
 			lines = 3;
 			pr_debug3("Detected binutils addr2line style\n");
 		} else {
-			if (!symbol_conf.disable_add2line_warn) {
+			if (!symbol_conf.addr2line_disable_warn) {
 				char *output = NULL;
 				size_t output_len;
 
@@ -310,7 +310,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	}
 
 	if (a2l == NULL) {
-		if (!symbol_conf.disable_add2line_warn)
+		if (!symbol_conf.addr2line_disable_warn)
 			pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name);
 		goto out;
 	}
@@ -330,7 +330,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	len = snprintf(buf, sizeof(buf), "%016"PRIx64"\n,\n", addr);
 	written = len > 0 ? write(a2l->in, buf, len) : -1;
 	if (written != len) {
-		if (!symbol_conf.disable_add2line_warn)
+		if (!symbol_conf.addr2line_disable_warn)
 			pr_warning("%s %s: could not send request\n", __func__, dso_name);
 		goto out;
 	}
@@ -339,7 +339,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 	switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, addr, /*first=*/true,
 				      &record_function, &record_filename, &record_line_nr)) {
 	case -1:
-		if (!symbol_conf.disable_add2line_warn)
+		if (!symbol_conf.addr2line_disable_warn)
 			pr_warning("%s %s: could not read first record\n", __func__, dso_name);
 		goto out;
 	case 0:
@@ -355,7 +355,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 					      /*addr=*/1, /*first=*/true,
 					      NULL, NULL, NULL)) {
 		case -1:
-			if (!symbol_conf.disable_add2line_warn)
+			if (!symbol_conf.addr2line_disable_warn)
 				pr_warning("%s %s: could not read sentinel record\n",
 					   __func__, dso_name);
 			break;
@@ -363,7 +363,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 			/* The sentinel as expected. */
 			break;
 		default:
-			if (!symbol_conf.disable_add2line_warn)
+			if (!symbol_conf.addr2line_disable_warn)
 				pr_warning("%s %s: unexpected record instead of sentinel",
 					   __func__, dso_name);
 			break;
diff --git a/tools/perf/util/block-info.c b/tools/perf/util/block-info.c
index 649392bee7ed..8d3a9a661f26 100644
--- a/tools/perf/util/block-info.c
+++ b/tools/perf/util/block-info.c
@@ -303,7 +303,7 @@ static int block_range_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
 	char buf[128];
 	char *start_line, *end_line;
 
-	symbol_conf.disable_add2line_warn = true;
+	symbol_conf.addr2line_disable_warn = true;
 
 	start_line = map__srcline(he->ms.map, bi->sym->start + bi->start,
 				  he->ms.sym);
diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c
index 63ea3fb53e77..c1c12308cc12 100644
--- a/tools/perf/util/libbfd.c
+++ b/tools/perf/util/libbfd.c
@@ -233,7 +233,7 @@ int libbfd__addr2line(const char *dso_name, u64 addr,
 	}
 
 	if (a2l == NULL) {
-		if (!symbol_conf.disable_add2line_warn)
+		if (!symbol_conf.addr2line_disable_warn)
 			pr_warning("addr2line_init failed for %s\n", dso_name);
 		return 0;
 	}
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index ac1b444a8fd8..21a1f096d4f0 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -51,7 +51,7 @@ struct symbol_conf {
 			report_block,
 			report_individual_block,
 			inline_name,
-			disable_add2line_warn,
+			addr2line_disable_warn,
 			no_buildid_mmap2,
 			guest_code,
 			lazy_load_kernel_maps,

From 59f6de4e8f2295f8beb2857d8b87e67218e63538 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Tue, 7 Apr 2026 12:08:34 +0200
Subject: [PATCH 094/131] perf config: Make symbol_conf::addr2line_disable_warn
 configurable

Make symbol_conf::addr2line_disable_warn configurable by reading
the perfconfig file.
Use section core and addr2line-disable-warn = value.
Update documentation.

Example:
 # perf config -l
 core.addr2line-timeout=5000
 core.addr2line-disable-warn=1
 #

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Suggested-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-config.txt | 6 ++++++
 tools/perf/util/config.c                 | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 642d1c490d9e..9b223f892829 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -210,6 +210,12 @@ core.*::
 		Sets a timeout (in milliseconds) for parsing /proc/<pid>/maps files.
 		Can be overridden by the --proc-map-timeout option on supported
 		subcommands. The default timeout is 500ms.
+	addr2line-disable-warn::
+		When set to 'true' disable all warnings from 'addr2line' output.
+		Default setting is 'false' to show these warnings.
+	addr2line-timeout::
+		Sets a timeout (in milliseconds) for parsing 'addr2line'
+		output.  The default timeout is 5s.
 
 tui.*, gtk.*::
 	Subcommands that can be configured here are 'top', 'report' and 'annotate'.
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 0452fbc6c085..8e30def2b1f7 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -461,6 +461,9 @@ static int perf_default_core_config(const char *var, const char *value)
 	if (!strcmp(var, "core.addr2line-timeout"))
 		addr2line_timeout_ms = strtoul(value, NULL, 10);
 
+	if (!strcmp(var, "core.addr2line-disable-warn"))
+		symbol_conf.addr2line_disable_warn = perf_config_bool(var, value);
+
 	/* Add other config variables here. */
 	return 0;
 }

From 83674a78293f113b47a042d4470c264f6aa54fd5 Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Tue, 7 Apr 2026 12:08:35 +0200
Subject: [PATCH 095/131] perf addr2line: Remove global variable
 addr2line_timeout_ms

Remove global variable addr2line_timeout_ms and add it as a member
to symbol_conf structure.

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Ian Rogers <irogers@google.com>
[namhyung: move the initialization to util/symbol.c]
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/addr2line.c   | 5 +----
 tools/perf/util/addr2line.h   | 2 --
 tools/perf/util/config.c      | 3 +--
 tools/perf/util/symbol.c      | 1 +
 tools/perf/util/symbol_conf.h | 1 +
 5 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c
index e9f084db0802..4b0d349ed334 100644
--- a/tools/perf/util/addr2line.c
+++ b/tools/perf/util/addr2line.c
@@ -18,9 +18,6 @@
 
 #define MAX_INLINE_NEST 1024
 
-/* If addr2line doesn't return data for 5 seconds then timeout. */
-int addr2line_timeout_ms = 5 * 1000;
-
 static int filename_split(char *filename, unsigned int *line_nr)
 {
 	char *sep;
@@ -335,7 +332,7 @@ int cmd__addr2line(const char *dso_name, u64 addr,
 		goto out;
 	}
 	io__init(&io, a2l->out, buf, sizeof(buf));
-	io.timeout_ms = addr2line_timeout_ms;
+	io.timeout_ms = symbol_conf.addr2line_timeout_ms;
 	switch (read_addr2line_record(&io, cmd_a2l_style, dso_name, addr, /*first=*/true,
 				      &record_function, &record_filename, &record_line_nr)) {
 	case -1:
diff --git a/tools/perf/util/addr2line.h b/tools/perf/util/addr2line.h
index d35a47ba8dab..75989a92f16b 100644
--- a/tools/perf/util/addr2line.h
+++ b/tools/perf/util/addr2line.h
@@ -8,8 +8,6 @@ struct dso;
 struct inline_node;
 struct symbol;
 
-extern int addr2line_timeout_ms;
-
 int cmd__addr2line(const char *dso_name, u64 addr,
 		   char **file, unsigned int *line_nr,
 		   struct dso *dso,
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 8e30def2b1f7..087002fb1b9b 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -19,7 +19,6 @@
 #include "util/hist.h"  /* perf_hist_config */
 #include "util/stat.h"  /* perf_stat__set_big_num */
 #include "util/evsel.h"  /* evsel__hw_names, evsel__use_bpf_counters */
-#include "util/addr2line.h"  /* addr2line_timeout_ms */
 #include "srcline.h"
 #include "build-id.h"
 #include "debug.h"
@@ -459,7 +458,7 @@ static int perf_default_core_config(const char *var, const char *value)
 		proc_map_timeout = strtoul(value, NULL, 10);
 
 	if (!strcmp(var, "core.addr2line-timeout"))
-		addr2line_timeout_ms = strtoul(value, NULL, 10);
+		symbol_conf.addr2line_timeout_ms = strtoul(value, NULL, 10);
 
 	if (!strcmp(var, "core.addr2line-disable-warn"))
 		symbol_conf.addr2line_disable_warn = perf_config_bool(var, value);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index b4b30675688d..94745a12973f 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -69,6 +69,7 @@ struct symbol_conf symbol_conf = {
 	.event_group		= true,
 	.inline_name		= true,
 	.res_sample		= 0,
+	.addr2line_timeout_ms	= 5 * 1000,
 };
 
 struct map_list_node {
diff --git a/tools/perf/util/symbol_conf.h b/tools/perf/util/symbol_conf.h
index 21a1f096d4f0..6cd454d7c98e 100644
--- a/tools/perf/util/symbol_conf.h
+++ b/tools/perf/util/symbol_conf.h
@@ -80,6 +80,7 @@ struct symbol_conf {
 			*bt_stop_list_str;
 	const char		*addr2line_path;
 	enum a2l_style	addr2line_style[MAX_A2L_STYLE];
+	int             addr2line_timeout_ms;
 	unsigned long	time_quantum;
        struct strlist	*dso_list,
 			*comm_list,

From b01741b2854aef073a8106468903aba0cf4f8539 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 7 Apr 2026 19:08:36 -0700
Subject: [PATCH 096/131] perf maps: Move getting debug_file to verbose path

Getting debug_file can trigger warnings if not set. Avoid getting
these warnings by pushing the use under the controlling if.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/maps.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index 4092211cff62..7dd6da9d1e4f 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -844,7 +844,6 @@ static int __maps__insert_sorted(struct maps *maps, unsigned int first_after_ind
 static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 {
 	int err = 0;
-	FILE *fp = debug_file();
 	unsigned int i, ni = INT_MAX; // Some gcc complain, but depends on maps_by_name...
 
 	if (!maps__maps_by_address_sorted(maps))
@@ -872,8 +871,8 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 				dso__name(map__dso(new)));
 		} else if (verbose >= 2) {
 			pr_debug("overlapping maps:\n");
-			map__fprintf(new, fp);
-			map__fprintf(pos, fp);
+			map__fprintf(new, debug_file());
+			map__fprintf(pos, debug_file());
 		}
 
 		if (maps_by_name)
@@ -894,7 +893,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 			map__set_end(before, map__start(new));
 
 			if (verbose >= 2 && !use_browser)
-				map__fprintf(before, fp);
+				map__fprintf(before, debug_file());
 		}
 		if (map__end(new) < map__end(pos)) {
 			/* The new map isn't as long as the existing map. */
@@ -912,7 +911,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 			       map__map_ip(after, map__end(new)));
 
 			if (verbose >= 2 && !use_browser)
-				map__fprintf(after, fp);
+				map__fprintf(after, debug_file());
 		}
 		/*
 		 * If adding one entry, for `before` or `after`, we can replace

From c4f3ff3289380437d26177e8f2fe4b7507816ee3 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 7 Apr 2026 19:08:37 -0700
Subject: [PATCH 097/131] perf maps: Fix fixup_overlap_and_insert that can
 break sorted by name order

When an entry in the address array is replaced, the corresponding name
entry is replaced. The entries' names may sort differently and so it is
important that the sorted by name property be cleared on the maps.

Fixes: 0d11fab32714 ("perf maps: Fixup maps_by_name when modifying maps_by_address")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/maps.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index 7dd6da9d1e4f..b44bc41f51f3 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -955,6 +955,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 			if (maps_by_name) {
 				map__put(maps_by_name[ni]);
 				maps_by_name[ni] = map__get(new);
+				maps__set_maps_by_name_sorted(maps, false);
 			}
 
 			err = __maps__insert_sorted(maps, i + 1, after, NULL);
@@ -981,6 +982,7 @@ static int __maps__fixup_overlap_and_insert(struct maps *maps, struct map *new)
 				if (maps_by_name) {
 					map__put(maps_by_name[ni]);
 					maps_by_name[ni] = map__get(new);
+					maps__set_maps_by_name_sorted(maps, false);
 				}
 
 				check_invariants(maps);

From f552b132e4d5248715828e7e5c2bf7889bf05b2e Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Tue, 7 Apr 2026 19:08:38 -0700
Subject: [PATCH 098/131] perf maps: Fix copy_from that can break sorted by
 name order

When a parent is copied into a child the name array is populated in
address not name order. Make sure the name array isn't flagged as sorted.

Fixes: 659ad3492b91 ("perf maps: Switch from rbtree to lazily sorted array for addresses")
Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/maps.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/maps.c b/tools/perf/util/maps.c
index b44bc41f51f3..81a97ac34077 100644
--- a/tools/perf/util/maps.c
+++ b/tools/perf/util/maps.c
@@ -1081,16 +1081,9 @@ int maps__copy_from(struct maps *dest, struct maps *parent)
 				map__put(new);
 		}
 		maps__set_maps_by_address_sorted(dest, maps__maps_by_address_sorted(parent));
-		if (!err) {
-			RC_CHK_ACCESS(dest)->last_search_by_name_idx =
-				RC_CHK_ACCESS(parent)->last_search_by_name_idx;
-			maps__set_maps_by_name_sorted(dest,
-						dest_maps_by_name &&
-						maps__maps_by_name_sorted(parent));
-		} else {
-			RC_CHK_ACCESS(dest)->last_search_by_name_idx = 0;
-			maps__set_maps_by_name_sorted(dest, false);
-		}
+		RC_CHK_ACCESS(dest)->last_search_by_name_idx = 0;
+		/* Values were copied into the name array in address order. */
+		maps__set_maps_by_name_sorted(dest, false);
 	} else {
 		/* Unexpected copying to a maps containing entries. */
 		for (unsigned int i = 0; !err && i < n; i++) {

From d3e01be6daab9f76f3c8b0ffd556ed9f18275c22 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:31:56 -0300
Subject: [PATCH 099/131] perf symbols: Make variable receiving result
 strrchr() const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixing:

  util/symbol.c: In function ‘symbol__config_symfs’:
  util/symbol.c:2499:20: error: assignment discards ‘const’ qualifier from pointer target type [-Werror=discarded-qualifiers]
   2499 |         layout_str = strrchr(dir, ',');
        |

With recent gcc/glibc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/symbol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 94745a12973f..fcaeeddbbb6b 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -2493,7 +2493,7 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,
 			 const char *dir, int unset __maybe_unused)
 {
 	char *bf = NULL;
-	char *layout_str;
+	const char *layout_str;
 	char *dir_copy;
 	int ret;
 

From e5cce1b9c82fbd48e2f1f7a25a9fad8ee228176f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:31:57 -0300
Subject: [PATCH 100/131] perf util: Kill die() prototype, dead for a long time

In fef2a735167a827a ("perf tools: Kill die()") the die() function was
removed, but not the prototype in util.h, now when building with
LIBPERL=1, during a 'make -C tools/perf build-test' routine test, it is
failing as perl likes die() calls and then this clashes with this
remnant, remove it.

Fixes: fef2a735167a827a ("perf tools: Kill die()")
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/util.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 394dbfa944ac..e935438451b8 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -30,7 +30,6 @@ extern bool perf_guest;
 
 /* General helper functions */
 void usage(const char *err) __noreturn;
-void die(const char *err, ...) __noreturn __printf(1, 2);
 
 struct dirent;
 struct strlist;

From 046fd8206d820b71e7870f7b894b46f8a15ae974 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 8 Apr 2026 14:31:58 -0300
Subject: [PATCH 101/131] perf tools: Make more global variables static

`make check` will run sparse on the perf code base. A frequent warning
is "warning: symbol '...' was not declared. Should it be static?" Go
through and make global definitions without declarations static.

In some cases it is deliberate due to dlsym accessing the symbol, this
change doesn't clean up the missing declarations for perf test suites.

Sometimes things can opportunistically be made const.

Making some things static exposed unused function warnings, so
restructuring of ifdefs was necessary for that.

These changes reduce the size of the perf binary by 568 bytes.

Committer notes:

Refreshed the patch, the original one fell thru the cracks, updated the
size reduction.

Remove the trace-event-scripting.c changes, as they break the build, noticed
with container builds and with sashiko:

  https://sashiko.dev/#/patchset/20260401215306.2152898-1-acme%40kernel.org

Also make two variables static to address another sashiko review
comment:

  https://sashiko.dev/#/patchset/20260402001740.2220481-1-acme%40kernel.org

Signed-off-by: Ian Rogers <irogers@google.com>
Acked-by: Ankur Arora <ankur.a.arora@oracle.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Howard Chu <howardchu95@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Leo Yan <leo.yan@arm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <pjw@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Yujie Liu <yujie.liu@intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/arch/common.c                      |  22 ++--
 tools/perf/arch/sh/include/dwarf-regs-table.h |   2 +-
 tools/perf/bench/breakpoint.c                 |   4 +-
 tools/perf/bench/mem-functions.c              |   2 +-
 tools/perf/bench/numa.c                       |   2 +-
 tools/perf/bench/uprobe.c                     |   2 +-
 tools/perf/builtin-c2c.c                      |   7 +-
 tools/perf/builtin-config.c                   |   2 +-
 tools/perf/builtin-data.c                     |   8 +-
 tools/perf/builtin-diff.c                     |   4 +-
 tools/perf/builtin-kmem.c                     |   2 +-
 tools/perf/builtin-kwork.c                    |  12 +-
 tools/perf/builtin-script.c                   |   2 +-
 tools/perf/builtin-top.c                      |   5 +-
 tools/perf/tests/bp_signal.c                  |   2 +-
 tools/perf/tests/dso-data.c                   |   2 +-
 tools/perf/tests/wp.c                         |   6 +-
 tools/perf/util/block-range.c                 |   2 +-
 tools/perf/util/bpf_counter.c                 |   4 +-
 tools/perf/util/bpf_off_cpu.c                 |   2 +-
 tools/perf/util/debug.c                       |   2 +-
 tools/perf/util/debuginfo.c                   |  19 ++--
 tools/perf/util/sort.c                        | 104 +++++++++---------
 tools/perf/util/util.c                        |   2 -
 24 files changed, 109 insertions(+), 112 deletions(-)

diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 4908d54dd33b..21836f70f231 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -9,14 +9,14 @@
 #include "../util/debug.h"
 #include <linux/zalloc.h>
 
-const char *const arc_triplets[] = {
+static const char *const arc_triplets[] = {
 	"arc-linux-",
 	"arc-snps-linux-uclibc-",
 	"arc-snps-linux-gnu-",
 	NULL
 };
 
-const char *const arm_triplets[] = {
+static const char *const arm_triplets[] = {
 	"arm-eabi-",
 	"arm-linux-androideabi-",
 	"arm-unknown-linux-",
@@ -28,13 +28,13 @@ const char *const arm_triplets[] = {
 	NULL
 };
 
-const char *const arm64_triplets[] = {
+static const char *const arm64_triplets[] = {
 	"aarch64-linux-android-",
 	"aarch64-linux-gnu-",
 	NULL
 };
 
-const char *const powerpc_triplets[] = {
+static const char *const powerpc_triplets[] = {
 	"powerpc-unknown-linux-gnu-",
 	"powerpc-linux-gnu-",
 	"powerpc64-unknown-linux-gnu-",
@@ -43,40 +43,40 @@ const char *const powerpc_triplets[] = {
 	NULL
 };
 
-const char *const riscv32_triplets[] = {
+static const char *const riscv32_triplets[] = {
 	"riscv32-unknown-linux-gnu-",
 	"riscv32-linux-android-",
 	"riscv32-linux-gnu-",
 	NULL
 };
 
-const char *const riscv64_triplets[] = {
+static const char *const riscv64_triplets[] = {
 	"riscv64-unknown-linux-gnu-",
 	"riscv64-linux-android-",
 	"riscv64-linux-gnu-",
 	NULL
 };
 
-const char *const s390_triplets[] = {
+static const char *const s390_triplets[] = {
 	"s390-ibm-linux-",
 	"s390x-linux-gnu-",
 	NULL
 };
 
-const char *const sh_triplets[] = {
+static const char *const sh_triplets[] = {
 	"sh-unknown-linux-gnu-",
 	"sh-linux-gnu-",
 	NULL
 };
 
-const char *const sparc_triplets[] = {
+static const char *const sparc_triplets[] = {
 	"sparc-unknown-linux-gnu-",
 	"sparc64-unknown-linux-gnu-",
 	"sparc64-linux-gnu-",
 	NULL
 };
 
-const char *const x86_triplets[] = {
+static const char *const x86_triplets[] = {
 	"x86_64-pc-linux-gnu-",
 	"x86_64-unknown-linux-gnu-",
 	"i686-pc-linux-gnu-",
@@ -90,7 +90,7 @@ const char *const x86_triplets[] = {
 	NULL
 };
 
-const char *const mips_triplets[] = {
+static const char *const mips_triplets[] = {
 	"mips-unknown-linux-gnu-",
 	"mipsel-linux-android-",
 	"mips-linux-gnu-",
diff --git a/tools/perf/arch/sh/include/dwarf-regs-table.h b/tools/perf/arch/sh/include/dwarf-regs-table.h
index 900e69619970..b5974a090fb4 100644
--- a/tools/perf/arch/sh/include/dwarf-regs-table.h
+++ b/tools/perf/arch/sh/include/dwarf-regs-table.h
@@ -2,7 +2,7 @@
 #ifdef DEFINE_DWARF_REGSTR_TABLE
 /* This is included in perf/util/dwarf-regs.c */
 
-const char * const sh_regstr_tbl[] = {
+static const char * const sh_regstr_tbl[] = {
 	"r0",
 	"r1",
 	"r2",
diff --git a/tools/perf/bench/breakpoint.c b/tools/perf/bench/breakpoint.c
index dfd18f5db97d..1b7cd4481bd2 100644
--- a/tools/perf/bench/breakpoint.c
+++ b/tools/perf/bench/breakpoint.c
@@ -16,7 +16,7 @@
 #include "bench.h"
 #include "futex.h"
 
-struct {
+static struct {
 	unsigned int nbreakpoints;
 	unsigned int nparallel;
 	unsigned int nthreads;
@@ -173,7 +173,7 @@ int bench_breakpoint_thread(int argc, const char **argv)
 	return 0;
 }
 
-struct {
+static struct {
 	unsigned int npassive;
 	unsigned int nactive;
 } enable_params = {
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
index f5ab41bb85bf..5ede52853953 100644
--- a/tools/perf/bench/mem-functions.c
+++ b/tools/perf/bench/mem-functions.c
@@ -399,7 +399,7 @@ static void mem_free(struct bench_mem_info *info __maybe_unused,
 	*dst = *src = NULL;
 }
 
-struct function memcpy_functions[] = {
+static struct function memcpy_functions[] = {
 	{ .name		= "default",
 	  .desc		= "Default memcpy() provided by glibc",
 	  .fn.init	= mem_alloc,
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 19be2aaf4dc0..6588a9b0b15a 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -166,7 +166,7 @@ static struct global_info	*g = NULL;
 static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
 static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);
 
-struct params p0;
+static struct params p0;
 
 static const struct option options[] = {
 	OPT_INTEGER('p', "nr_proc"	, &p0.nr_proc,		"number of processes"),
diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c
index c4dac868f1ee..89697ff788ef 100644
--- a/tools/perf/bench/uprobe.c
+++ b/tools/perf/bench/uprobe.c
@@ -58,7 +58,7 @@ static const char * const bench_uprobe_usage[] = {
 		goto cleanup; \
 	}
 
-struct bench_uprobe_bpf *skel;
+static struct bench_uprobe_bpf *skel;
 
 static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench)
 {
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index e60eea62c2fc..3ce5f0adec2f 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2892,9 +2892,10 @@ static int ui_quirks(void)
 
 #define CALLCHAIN_DEFAULT_OPT  "graph,0.5,caller,function,percent"
 
-const char callchain_help[] = "Display call graph (stack chain/backtrace):\n\n"
-				CALLCHAIN_REPORT_HELP
-				"\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT;
+static const char callchain_help[] =
+	"Display call graph (stack chain/backtrace):\n\n"
+	CALLCHAIN_REPORT_HELP
+	"\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT;
 
 static int
 parse_callchain_opt(const struct option *opt, const char *arg, int unset)
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 45b5312fbe83..237600643bbd 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -23,7 +23,7 @@ static const char * const config_usage[] = {
 	NULL
 };
 
-enum actions {
+static enum actions {
 	ACTION_LIST = 1
 } actions;
 
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index 85f59886b5cf..4c08ccb8c06b 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -28,15 +28,15 @@ static const char *data_usage[] = {
 	NULL
 };
 
-const char *to_json;
-const char *to_ctf;
-struct perf_data_convert_opts opts = {
+static const char *to_json;
+static const char *to_ctf;
+static struct perf_data_convert_opts opts = {
 	.force = false,
 	.all = false,
 	.time_str = NULL,
 };
 
-const struct option data_options[] = {
+static const struct option data_options[] = {
 		OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 		OPT_STRING('i', "input", &input_name, "file", "input file name"),
 		OPT_STRING(0, "to-json", &to_json, NULL, "Convert to JSON format"),
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 35d599d5c9fa..2c59e43901fe 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -113,7 +113,7 @@ enum {
 	COMPUTE_STREAM,	/* After COMPUTE_MAX to avoid use current compute arrays */
 };
 
-const char *compute_names[COMPUTE_MAX] = {
+static const char *compute_names[COMPUTE_MAX] = {
 	[COMPUTE_DELTA] = "delta",
 	[COMPUTE_DELTA_ABS] = "delta-abs",
 	[COMPUTE_RATIO] = "ratio",
@@ -382,7 +382,7 @@ static void block_hist_free(void *he)
 	free(bh);
 }
 
-struct hist_entry_ops block_hist_ops = {
+static struct hist_entry_ops block_hist_ops = {
 	.new    = block_hist_zalloc,
 	.free   = block_hist_free,
 };
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 7929a5fa5f46..9c64a0d74823 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -82,7 +82,7 @@ static unsigned long nr_allocs, nr_cross_allocs;
 
 /* filters for controlling start and stop of time of analysis */
 static struct perf_time_interval ptime;
-const char *time_str;
+static const char *time_str;
 
 static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
 			     int bytes_req, int bytes_alloc, int cpu)
diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c
index 6f94a8f45f60..1140e00e874f 100644
--- a/tools/perf/builtin-kwork.c
+++ b/tools/perf/builtin-kwork.c
@@ -985,7 +985,7 @@ static int process_irq_handler_exit_event(const struct perf_tool *tool,
 	return 0;
 }
 
-const struct evsel_str_handler irq_tp_handlers[] = {
+static const struct evsel_str_handler irq_tp_handlers[] = {
 	{ "irq:irq_handler_entry", process_irq_handler_entry_event, },
 	{ "irq:irq_handler_exit",  process_irq_handler_exit_event,  },
 };
@@ -1080,7 +1080,7 @@ static int process_softirq_exit_event(const struct perf_tool *tool,
 	return 0;
 }
 
-const struct evsel_str_handler softirq_tp_handlers[] = {
+static const struct evsel_str_handler softirq_tp_handlers[] = {
 	{ "irq:softirq_raise", process_softirq_raise_event, },
 	{ "irq:softirq_entry", process_softirq_entry_event, },
 	{ "irq:softirq_exit",  process_softirq_exit_event,  },
@@ -1211,7 +1211,7 @@ static int process_workqueue_execute_end_event(const struct perf_tool *tool,
 	return 0;
 }
 
-const struct evsel_str_handler workqueue_tp_handlers[] = {
+static const struct evsel_str_handler workqueue_tp_handlers[] = {
 	{ "workqueue:workqueue_activate_work", process_workqueue_activate_work_event, },
 	{ "workqueue:workqueue_execute_start", process_workqueue_execute_start_event, },
 	{ "workqueue:workqueue_execute_end",   process_workqueue_execute_end_event,   },
@@ -1281,7 +1281,7 @@ static int process_sched_switch_event(const struct perf_tool *tool,
 	return 0;
 }
 
-const struct evsel_str_handler sched_tp_handlers[] = {
+static const struct evsel_str_handler sched_tp_handlers[] = {
 	{ "sched:sched_switch",  process_sched_switch_event, },
 };
 
@@ -1561,13 +1561,13 @@ static void print_bad_events(struct perf_kwork *kwork)
 	}
 }
 
-const char *graph_load = "||||||||||||||||||||||||||||||||||||||||||||||||";
-const char *graph_idle = "                                                ";
 static void top_print_per_cpu_load(struct perf_kwork *kwork)
 {
 	int i, load_width;
 	u64 total, load, load_ratio;
 	struct kwork_top_stat *stat = &kwork->top_stat;
+	const char *graph_load = "||||||||||||||||||||||||||||||||||||||||||||||||";
+	const char *graph_idle = "                                                ";
 
 	for (i = 0; i < MAX_NR_CPUS; i++) {
 		total = stat->cpus_runtime[i].total;
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 42d4cc162039..43ce119dac3e 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -166,7 +166,7 @@ struct perf_script {
 	int			range_num;
 };
 
-struct output_option {
+static struct output_option {
 	const char *str;
 	enum perf_output_field field;
 } all_output_options[] = {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 37950efb28ac..f6eb543de537 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1449,11 +1449,10 @@ parse_percent_limit(const struct option *opt, const char *arg,
 	return 0;
 }
 
-const char top_callchain_help[] = CALLCHAIN_RECORD_HELP CALLCHAIN_REPORT_HELP
-	"\n\t\t\t\tDefault: fp,graph,0.5,caller,function";
-
 int cmd_top(int argc, const char **argv)
 {
+	static const char top_callchain_help[] = CALLCHAIN_RECORD_HELP CALLCHAIN_REPORT_HELP
+		"\n\t\t\t\tDefault: fp,graph,0.5,caller,function";
 	char errbuf[BUFSIZ];
 	struct perf_top top = {
 		.count_filter	     = 5,
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c
index 3faeb5b6fe0b..f580ba7486b1 100644
--- a/tools/perf/tests/bp_signal.c
+++ b/tools/perf/tests/bp_signal.c
@@ -36,7 +36,7 @@ static int fd3;
 static int overflows;
 static int overflows_2;
 
-volatile long the_var;
+static volatile long the_var;
 
 
 /*
diff --git a/tools/perf/tests/dso-data.c b/tools/perf/tests/dso-data.c
index a1fff4203b75..46bc3f597260 100644
--- a/tools/perf/tests/dso-data.c
+++ b/tools/perf/tests/dso-data.c
@@ -58,7 +58,7 @@ struct test_data_offset {
 	int size;
 };
 
-struct test_data_offset offsets[] = {
+static struct test_data_offset offsets[] = {
 	/* Fill first cache page. */
 	{
 		.offset = 10,
diff --git a/tools/perf/tests/wp.c b/tools/perf/tests/wp.c
index 6c178985e37f..69b31f00eed0 100644
--- a/tools/perf/tests/wp.c
+++ b/tools/perf/tests/wp.c
@@ -22,11 +22,11 @@ do {                                            \
 
 #ifdef __i386__
 /* Only breakpoint length less-than 8 has hardware support on i386. */
-volatile u32 data1;
+static volatile u32 data1;
 #else
-volatile u64 data1;
+static volatile u64 data1;
 #endif
-volatile u8 data2[3];
+static volatile u8 data2[3];
 
 #ifndef __s390x__
 static int wp_read(int fd, long long *count, int size)
diff --git a/tools/perf/util/block-range.c b/tools/perf/util/block-range.c
index 15c42196c24c..7c559fcfd7e0 100644
--- a/tools/perf/util/block-range.c
+++ b/tools/perf/util/block-range.c
@@ -4,7 +4,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-struct {
+static struct {
 	struct rb_root root;
 	u64 blocks;
 } block_ranges;
diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c
index 2ffd7aefb6eb..34b6b0da18b7 100644
--- a/tools/perf/util/bpf_counter.c
+++ b/tools/perf/util/bpf_counter.c
@@ -353,7 +353,7 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx
 	return 0;
 }
 
-struct bpf_counter_ops bpf_program_profiler_ops = {
+static struct bpf_counter_ops bpf_program_profiler_ops = {
 	.load       = bpf_program_profiler__load,
 	.enable	    = bpf_program_profiler__enable,
 	.disable    = bpf_program_profiler__disable,
@@ -833,7 +833,7 @@ static int bperf__destroy(struct evsel *evsel)
  * the leader prog.
  */
 
-struct bpf_counter_ops bperf_ops = {
+static struct bpf_counter_ops bperf_ops = {
 	.load       = bperf__load,
 	.enable     = bperf__enable,
 	.disable    = bperf__disable,
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
index 0891d9c73660..a3b699a5322f 100644
--- a/tools/perf/util/bpf_off_cpu.c
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -39,7 +39,7 @@ union off_cpu_data {
 	u64 array[1024 / sizeof(u64)];
 };
 
-u64 off_cpu_raw[MAX_STACKS + 5];
+static u64 off_cpu_raw[MAX_STACKS + 5];
 
 static int off_cpu_config(struct evlist *evlist)
 {
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 1dfa4d0eec4d..6b5ffe81f141 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -48,7 +48,7 @@ int debug_ordered_events;
 static int redirect_to_stderr;
 int debug_data_convert;
 static FILE *_debug_file;
-bool debug_display_time;
+static bool debug_display_time;
 int debug_type_profile;
 
 FILE *debug_file(void)
diff --git a/tools/perf/util/debuginfo.c b/tools/perf/util/debuginfo.c
index 4a559b3e8cdc..0e35c13abd04 100644
--- a/tools/perf/util/debuginfo.c
+++ b/tools/perf/util/debuginfo.c
@@ -88,18 +88,17 @@ static struct debuginfo *__debuginfo__new(const char *path)
 	return dbg;
 }
 
-enum dso_binary_type distro_dwarf_types[] = {
-	DSO_BINARY_TYPE__FEDORA_DEBUGINFO,
-	DSO_BINARY_TYPE__UBUNTU_DEBUGINFO,
-	DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
-	DSO_BINARY_TYPE__BUILDID_DEBUGINFO,
-	DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO,
-	DSO_BINARY_TYPE__NOT_FOUND,
-};
-
 struct debuginfo *debuginfo__new(const char *path)
 {
-	enum dso_binary_type *type;
+	static const enum dso_binary_type distro_dwarf_types[] = {
+		DSO_BINARY_TYPE__FEDORA_DEBUGINFO,
+		DSO_BINARY_TYPE__UBUNTU_DEBUGINFO,
+		DSO_BINARY_TYPE__OPENEMBEDDED_DEBUGINFO,
+		DSO_BINARY_TYPE__BUILDID_DEBUGINFO,
+		DSO_BINARY_TYPE__MIXEDUP_UBUNTU_DEBUGINFO,
+		DSO_BINARY_TYPE__NOT_FOUND,
+	};
+	const enum dso_binary_type *type;
 	char buf[PATH_MAX], nil = '\0';
 	struct dso *dso;
 	struct debuginfo *dinfo = NULL;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 5c9656cc4f9d..6ce684d68bd6 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -44,11 +44,11 @@ regex_t		parent_regex;
 const char	default_parent_pattern[] = "^sys_|^do_page_fault";
 const char	*parent_pattern = default_parent_pattern;
 const char	*default_sort_order = "comm,dso,symbol";
-const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
+static const char	default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
 const char	default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat,local_p_stage_cyc";
-const char	default_top_sort_order[] = "dso,symbol";
-const char	default_diff_sort_order[] = "dso,symbol";
-const char	default_tracepoint_sort_order[] = "trace";
+static const char	default_top_sort_order[] = "dso,symbol";
+static const char	default_diff_sort_order[] = "dso,symbol";
+static const char	default_tracepoint_sort_order[] = "trace";
 const char	*sort_order;
 const char	*field_order;
 regex_t		ignore_callees_regex;
@@ -173,7 +173,7 @@ static int hist_entry__tgid_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%7d:%-*.*s", tgid, width, width, comm ?: "");
 }
 
-struct sort_entry sort_tgid = {
+static struct sort_entry sort_tgid = {
 	.se_header	= "   Tgid:Command",
 	.se_cmp		= sort__tgid_cmp,
 	.se_snprintf	= hist_entry__tgid_snprintf,
@@ -219,7 +219,7 @@ static int hist_entry__simd_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "[.] %s", name);
 }
 
-struct sort_entry sort_simd = {
+static struct sort_entry sort_simd = {
 	.se_header	= "Simd   ",
 	.se_cmp		= sort__simd_cmp,
 	.se_snprintf	= hist_entry__simd_snprintf,
@@ -605,7 +605,7 @@ hist_entry__symoff_snprintf(struct hist_entry *he, char *bf, size_t size, unsign
 	return repsep_snprintf(bf, size, "[%c] %s+0x%llx", he->level, sym->name, he->ip - sym->start);
 }
 
-struct sort_entry sort_sym_offset = {
+static struct sort_entry sort_sym_offset = {
 	.se_header	= "Symbol Offset",
 	.se_cmp		= sort__symoff_cmp,
 	.se_sort	= sort__symoff_sort,
@@ -716,7 +716,7 @@ static int hist_entry__srcline_from_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_from);
 }
 
-struct sort_entry sort_srcline_from = {
+static struct sort_entry sort_srcline_from = {
 	.se_header	= "From Source:Line",
 	.se_cmp		= sort__srcline_from_cmp,
 	.se_collapse	= sort__srcline_from_collapse,
@@ -764,7 +764,7 @@ static int hist_entry__srcline_to_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, he->branch_info->srcline_to);
 }
 
-struct sort_entry sort_srcline_to = {
+static struct sort_entry sort_srcline_to = {
 	.se_header	= "To Source:Line",
 	.se_cmp		= sort__srcline_to_cmp,
 	.se_collapse	= sort__srcline_to_collapse,
@@ -800,7 +800,7 @@ static int hist_entry__sym_ipc_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*s", width, tmp);
 }
 
-struct sort_entry sort_sym_ipc = {
+static struct sort_entry sort_sym_ipc = {
 	.se_header	= "IPC   [IPC Coverage]",
 	.se_cmp		= sort__sym_cmp,
 	.se_snprintf	= hist_entry__sym_ipc_snprintf,
@@ -818,7 +818,7 @@ static int hist_entry__sym_ipc_null_snprintf(struct hist_entry *he
 	return repsep_snprintf(bf, size, "%-*s", width, tmp);
 }
 
-struct sort_entry sort_sym_ipc_null = {
+static struct sort_entry sort_sym_ipc_null = {
 	.se_header	= "IPC   [IPC Coverage]",
 	.se_cmp		= sort__sym_cmp,
 	.se_snprintf	= hist_entry__sym_ipc_null_snprintf,
@@ -851,7 +851,7 @@ static int hist_entry__callchain_branch_predicted_snprintf(
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, str);
 }
 
-struct sort_entry sort_callchain_branch_predicted = {
+static struct sort_entry sort_callchain_branch_predicted = {
 	.se_header	= "Predicted",
 	.se_cmp		= sort__callchain_branch_predicted_cmp,
 	.se_snprintf	= hist_entry__callchain_branch_predicted_snprintf,
@@ -881,7 +881,7 @@ static int hist_entry__callchain_branch_abort_snprintf(struct hist_entry *he,
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, str);
 }
 
-struct sort_entry sort_callchain_branch_abort = {
+static struct sort_entry sort_callchain_branch_abort = {
 	.se_header	= "Abort",
 	.se_cmp		= sort__callchain_branch_abort_cmp,
 	.se_snprintf	= hist_entry__callchain_branch_abort_snprintf,
@@ -914,7 +914,7 @@ static int hist_entry__callchain_branch_cycles_snprintf(struct hist_entry *he,
 	return repsep_snprintf(bf, size, "%-*.*s", width, width, str);
 }
 
-struct sort_entry sort_callchain_branch_cycles = {
+static struct sort_entry sort_callchain_branch_cycles = {
 	.se_header	= "Cycles",
 	.se_cmp		= sort__callchain_branch_cycles_cmp,
 	.se_snprintf	= hist_entry__callchain_branch_cycles_snprintf,
@@ -981,7 +981,7 @@ static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-.*s", width, he->srcfile);
 }
 
-struct sort_entry sort_srcfile = {
+static struct sort_entry sort_srcfile = {
 	.se_header	= "Source File",
 	.se_cmp		= sort__srcfile_cmp,
 	.se_collapse	= sort__srcfile_collapse,
@@ -1033,7 +1033,7 @@ static int hist_entry__cpu_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%*.*d", width, width, he->cpu);
 }
 
-struct sort_entry sort_cpu = {
+static struct sort_entry sort_cpu = {
 	.se_header      = "CPU",
 	.se_cmp	        = sort__cpu_cmp,
 	.se_snprintf    = hist_entry__cpu_snprintf,
@@ -1064,7 +1064,7 @@ static int hist_entry__parallelism_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%*d", width, he->parallelism);
 }
 
-struct sort_entry sort_parallelism = {
+static struct sort_entry sort_parallelism = {
 	.se_header      = "Parallelism",
 	.se_cmp	        = sort__parallelism_cmp,
 	.se_filter	= hist_entry__parallelism_filter,
@@ -1105,7 +1105,7 @@ static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
 			       he->cgroup_id.ino);
 }
 
-struct sort_entry sort_cgroup_id = {
+static struct sort_entry sort_cgroup_id = {
 	.se_header      = "cgroup id (dev/inode)",
 	.se_cmp	        = sort__cgroup_id_cmp,
 	.se_snprintf    = hist_entry__cgroup_id_snprintf,
@@ -1138,7 +1138,7 @@ static int hist_entry__cgroup_snprintf(struct hist_entry *he,
 	return repsep_snprintf(bf, size, "%s", cgrp_name);
 }
 
-struct sort_entry sort_cgroup = {
+static struct sort_entry sort_cgroup = {
 	.se_header      = "Cgroup",
 	.se_cmp	        = sort__cgroup_cmp,
 	.se_snprintf    = hist_entry__cgroup_snprintf,
@@ -1169,7 +1169,7 @@ static int hist_entry__socket_filter(struct hist_entry *he, int type, const void
 	return sk >= 0 && he->socket != sk;
 }
 
-struct sort_entry sort_socket = {
+static struct sort_entry sort_socket = {
 	.se_header      = "Socket",
 	.se_cmp	        = sort__socket_cmp,
 	.se_snprintf    = hist_entry__socket_snprintf,
@@ -1200,7 +1200,7 @@ static int hist_entry__time_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-.*s", width, he_time);
 }
 
-struct sort_entry sort_time = {
+static struct sort_entry sort_time = {
 	.se_header      = "Time",
 	.se_cmp	        = sort__time_cmp,
 	.se_snprintf    = hist_entry__time_snprintf,
@@ -1269,7 +1269,7 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-.*s", width, he->trace_output);
 }
 
-struct sort_entry sort_trace = {
+static struct sort_entry sort_trace = {
 	.se_header      = "Trace output",
 	.se_cmp	        = sort__trace_cmp,
 	.se_snprintf    = hist_entry__trace_snprintf,
@@ -1564,7 +1564,7 @@ sort__addr_to_cmp(struct hist_entry *left, struct hist_entry *right)
 	return _sort__addr_cmp(to_l->addr, to_r->addr);
 }
 
-struct sort_entry sort_addr_from = {
+static struct sort_entry sort_addr_from = {
 	.se_header	= "Source Address",
 	.se_cmp		= sort__addr_from_cmp,
 	.se_snprintf	= hist_entry__addr_from_snprintf,
@@ -1572,7 +1572,7 @@ struct sort_entry sort_addr_from = {
 	.se_width_idx	= HISTC_ADDR_FROM,
 };
 
-struct sort_entry sort_addr_to = {
+static struct sort_entry sort_addr_to = {
 	.se_header	= "Target Address",
 	.se_cmp		= sort__addr_to_cmp,
 	.se_snprintf	= hist_entry__addr_to_snprintf,
@@ -1629,7 +1629,7 @@ static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
 			       he->branch_info->flags.cycles);
 }
 
-struct sort_entry sort_cycles = {
+static struct sort_entry sort_cycles = {
 	.se_header	= "Basic Block Cycles",
 	.se_cmp		= sort__cycles_cmp,
 	.se_snprintf	= hist_entry__cycles_snprintf,
@@ -1919,7 +1919,7 @@ static int hist_entry__dcacheline_snprintf(struct hist_entry *he, char *bf,
 	return _hist_entry__sym_snprintf(ms, addr, level, bf, size, width);
 }
 
-struct sort_entry sort_mispredict = {
+static struct sort_entry sort_mispredict = {
 	.se_header	= "Branch Mispredicted",
 	.se_cmp		= sort__mispredict_cmp,
 	.se_snprintf	= hist_entry__mispredict_snprintf,
@@ -1938,7 +1938,7 @@ static int hist_entry__local_weight_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*llu", width, he->weight);
 }
 
-struct sort_entry sort_local_weight = {
+static struct sort_entry sort_local_weight = {
 	.se_header	= "Local Weight",
 	.se_cmp		= sort__weight_cmp,
 	.se_snprintf	= hist_entry__local_weight_snprintf,
@@ -1952,7 +1952,7 @@ static int hist_entry__global_weight_snprintf(struct hist_entry *he, char *bf,
 			       he->weight * he->stat.nr_events);
 }
 
-struct sort_entry sort_global_weight = {
+static struct sort_entry sort_global_weight = {
 	.se_header	= "Weight",
 	.se_cmp		= sort__weight_cmp,
 	.se_snprintf	= hist_entry__global_weight_snprintf,
@@ -1971,7 +1971,7 @@ static int hist_entry__local_ins_lat_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*u", width, he->ins_lat);
 }
 
-struct sort_entry sort_local_ins_lat = {
+static struct sort_entry sort_local_ins_lat = {
 	.se_header	= "Local INSTR Latency",
 	.se_cmp		= sort__ins_lat_cmp,
 	.se_snprintf	= hist_entry__local_ins_lat_snprintf,
@@ -1985,7 +1985,7 @@ static int hist_entry__global_ins_lat_snprintf(struct hist_entry *he, char *bf,
 			       he->ins_lat * he->stat.nr_events);
 }
 
-struct sort_entry sort_global_ins_lat = {
+static struct sort_entry sort_global_ins_lat = {
 	.se_header	= "INSTR Latency",
 	.se_cmp		= sort__ins_lat_cmp,
 	.se_snprintf	= hist_entry__global_ins_lat_snprintf,
@@ -2011,70 +2011,70 @@ static int hist_entry__p_stage_cyc_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*u", width, he->weight3);
 }
 
-struct sort_entry sort_local_p_stage_cyc = {
+static struct sort_entry sort_local_p_stage_cyc = {
 	.se_header      = "Local Pipeline Stage Cycle",
 	.se_cmp         = sort__p_stage_cyc_cmp,
 	.se_snprintf	= hist_entry__p_stage_cyc_snprintf,
 	.se_width_idx	= HISTC_LOCAL_P_STAGE_CYC,
 };
 
-struct sort_entry sort_global_p_stage_cyc = {
+static struct sort_entry sort_global_p_stage_cyc = {
 	.se_header      = "Pipeline Stage Cycle",
 	.se_cmp         = sort__p_stage_cyc_cmp,
 	.se_snprintf    = hist_entry__global_p_stage_cyc_snprintf,
 	.se_width_idx   = HISTC_GLOBAL_P_STAGE_CYC,
 };
 
-struct sort_entry sort_mem_daddr_sym = {
+static struct sort_entry sort_mem_daddr_sym = {
 	.se_header	= "Data Symbol",
 	.se_cmp		= sort__daddr_cmp,
 	.se_snprintf	= hist_entry__daddr_snprintf,
 	.se_width_idx	= HISTC_MEM_DADDR_SYMBOL,
 };
 
-struct sort_entry sort_mem_iaddr_sym = {
+static struct sort_entry sort_mem_iaddr_sym = {
 	.se_header	= "Code Symbol",
 	.se_cmp		= sort__iaddr_cmp,
 	.se_snprintf	= hist_entry__iaddr_snprintf,
 	.se_width_idx	= HISTC_MEM_IADDR_SYMBOL,
 };
 
-struct sort_entry sort_mem_daddr_dso = {
+static struct sort_entry sort_mem_daddr_dso = {
 	.se_header	= "Data Object",
 	.se_cmp		= sort__dso_daddr_cmp,
 	.se_snprintf	= hist_entry__dso_daddr_snprintf,
 	.se_width_idx	= HISTC_MEM_DADDR_DSO,
 };
 
-struct sort_entry sort_mem_locked = {
+static struct sort_entry sort_mem_locked = {
 	.se_header	= "Locked",
 	.se_cmp		= sort__locked_cmp,
 	.se_snprintf	= hist_entry__locked_snprintf,
 	.se_width_idx	= HISTC_MEM_LOCKED,
 };
 
-struct sort_entry sort_mem_tlb = {
+static struct sort_entry sort_mem_tlb = {
 	.se_header	= "TLB access",
 	.se_cmp		= sort__tlb_cmp,
 	.se_snprintf	= hist_entry__tlb_snprintf,
 	.se_width_idx	= HISTC_MEM_TLB,
 };
 
-struct sort_entry sort_mem_lvl = {
+static struct sort_entry sort_mem_lvl = {
 	.se_header	= "Memory access",
 	.se_cmp		= sort__lvl_cmp,
 	.se_snprintf	= hist_entry__lvl_snprintf,
 	.se_width_idx	= HISTC_MEM_LVL,
 };
 
-struct sort_entry sort_mem_snoop = {
+static struct sort_entry sort_mem_snoop = {
 	.se_header	= "Snoop",
 	.se_cmp		= sort__snoop_cmp,
 	.se_snprintf	= hist_entry__snoop_snprintf,
 	.se_width_idx	= HISTC_MEM_SNOOP,
 };
 
-struct sort_entry sort_mem_dcacheline = {
+static struct sort_entry sort_mem_dcacheline = {
 	.se_header	= "Data Cacheline",
 	.se_cmp		= sort__dcacheline_cmp,
 	.se_snprintf	= hist_entry__dcacheline_snprintf,
@@ -2109,7 +2109,7 @@ static int hist_entry__blocked_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%.*s", width, out);
 }
 
-struct sort_entry sort_mem_blocked = {
+static struct sort_entry sort_mem_blocked = {
 	.se_header	= "Blocked",
 	.se_cmp		= sort__blocked_cmp,
 	.se_snprintf	= hist_entry__blocked_snprintf,
@@ -2150,7 +2150,7 @@ static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
 	return width;
 }
 
-struct sort_entry sort_mem_phys_daddr = {
+static struct sort_entry sort_mem_phys_daddr = {
 	.se_header	= "Data Physical Address",
 	.se_cmp		= sort__phys_daddr_cmp,
 	.se_snprintf	= hist_entry__phys_daddr_snprintf,
@@ -2179,7 +2179,7 @@ static int hist_entry__data_page_size_snprintf(struct hist_entry *he, char *bf,
 			get_page_size_name(mem_info__daddr(he->mem_info)->data_page_size, str));
 }
 
-struct sort_entry sort_mem_data_page_size = {
+static struct sort_entry sort_mem_data_page_size = {
 	.se_header	= "Data Page Size",
 	.se_cmp		= sort__data_page_size_cmp,
 	.se_snprintf	= hist_entry__data_page_size_snprintf,
@@ -2204,7 +2204,7 @@ static int hist_entry__code_page_size_snprintf(struct hist_entry *he, char *bf,
 			       get_page_size_name(he->code_page_size, str));
 }
 
-struct sort_entry sort_code_page_size = {
+static struct sort_entry sort_code_page_size = {
 	.se_header	= "Code Page Size",
 	.se_cmp		= sort__code_page_size_cmp,
 	.se_snprintf	= hist_entry__code_page_size_snprintf,
@@ -2236,7 +2236,7 @@ static int hist_entry__abort_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
-struct sort_entry sort_abort = {
+static struct sort_entry sort_abort = {
 	.se_header	= "Transaction abort",
 	.se_cmp		= sort__abort_cmp,
 	.se_snprintf	= hist_entry__abort_snprintf,
@@ -2268,7 +2268,7 @@ static int hist_entry__in_tx_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*s", width, out);
 }
 
-struct sort_entry sort_in_tx = {
+static struct sort_entry sort_in_tx = {
 	.se_header	= "Branch in transaction",
 	.se_cmp		= sort__in_tx_cmp,
 	.se_snprintf	= hist_entry__in_tx_snprintf,
@@ -2340,7 +2340,7 @@ static int hist_entry__transaction_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-*s", width, buf);
 }
 
-struct sort_entry sort_transaction = {
+static struct sort_entry sort_transaction = {
 	.se_header	= "Transaction                ",
 	.se_cmp		= sort__transaction_cmp,
 	.se_snprintf	= hist_entry__transaction_snprintf,
@@ -2379,7 +2379,7 @@ static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
 	return _hist_entry__sym_size_snprintf(he->ms.sym, bf, size, width);
 }
 
-struct sort_entry sort_sym_size = {
+static struct sort_entry sort_sym_size = {
 	.se_header	= "Symbol size",
 	.se_cmp		= sort__sym_size_cmp,
 	.se_snprintf	= hist_entry__sym_size_snprintf,
@@ -2418,7 +2418,7 @@ static int hist_entry__dso_size_snprintf(struct hist_entry *he, char *bf,
 	return _hist_entry__dso_size_snprintf(he->ms.map, bf, size, width);
 }
 
-struct sort_entry sort_dso_size = {
+static struct sort_entry sort_dso_size = {
 	.se_header	= "DSO size",
 	.se_cmp		= sort__dso_size_cmp,
 	.se_snprintf	= hist_entry__dso_size_snprintf,
@@ -2455,7 +2455,7 @@ static int hist_entry__addr_snprintf(struct hist_entry *he, char *bf,
 	return repsep_snprintf(bf, size, "%-#*llx", width, ip);
 }
 
-struct sort_entry sort_addr = {
+static struct sort_entry sort_addr = {
 	.se_header	= "Address",
 	.se_cmp		= sort__addr_cmp,
 	.se_snprintf	= hist_entry__addr_snprintf,
@@ -2573,7 +2573,7 @@ static int hist_entry__typeoff_snprintf(struct hist_entry *he, char *bf,
 			       he->mem_type_off, buf);
 }
 
-struct sort_entry sort_type_offset = {
+static struct sort_entry sort_type_offset = {
 	.se_header	= "Data Type Offset",
 	.se_cmp		= sort__type_cmp,
 	.se_collapse	= sort__typeoff_sort,
@@ -2645,7 +2645,7 @@ static int hist_entry__typecln_snprintf(struct hist_entry *he, char *bf,
 			       he->mem_type_off / cln_size);
 }
 
-struct sort_entry sort_type_cacheline = {
+static struct sort_entry sort_type_cacheline = {
 	.se_header	= "Data Type Cacheline",
 	.se_cmp		= sort__type_cmp,
 	.se_collapse	= sort__typecln_sort,
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 8b893de35f77..c5fee8e39480 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -77,8 +77,6 @@ bool sysctl__nmi_watchdog_enabled(void)
 	return nmi_watchdog;
 }
 
-bool test_attr__enabled;
-
 bool exclude_GH_default;
 
 bool perf_host  = true;

From c89f35def821874d993bb1c033a7c3cbd32bccdb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:31:59 -0300
Subject: [PATCH 102/131] perf bench: Constify tables

Those tables and variables don't change, so capture this by
explicitly using 'const'.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-bench.c | 42 +++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index 02dea1b88228..02d47913cc6a 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -37,14 +37,14 @@ struct bench {
 };
 
 #ifdef HAVE_LIBNUMA_SUPPORT
-static struct bench numa_benchmarks[] = {
+static const struct bench numa_benchmarks[] = {
 	{ "mem",	"Benchmark for NUMA workloads",			bench_numa		},
 	{ "all",	"Run all NUMA benchmarks",			NULL			},
 	{ NULL,		NULL,						NULL			}
 };
 #endif
 
-static struct bench sched_benchmarks[] = {
+static const struct bench sched_benchmarks[] = {
 	{ "messaging",	"Benchmark for scheduling and IPC",		bench_sched_messaging	},
 	{ "pipe",	"Benchmark for pipe() between two processes",	bench_sched_pipe	},
 	{ "seccomp-notify",	"Benchmark for seccomp user notify",	bench_sched_seccomp_notify},
@@ -52,7 +52,7 @@ static struct bench sched_benchmarks[] = {
 	{ NULL,		NULL,						NULL			}
 };
 
-static struct bench syscall_benchmarks[] = {
+static const struct bench syscall_benchmarks[] = {
 	{ "basic",	"Benchmark for basic getppid(2) calls",		bench_syscall_basic	},
 	{ "getpgid",	"Benchmark for getpgid(2) calls",		bench_syscall_getpgid	},
 	{ "fork",	"Benchmark for fork(2) calls",			bench_syscall_fork	},
@@ -61,7 +61,7 @@ static struct bench syscall_benchmarks[] = {
 	{ NULL,		NULL,						NULL			},
 };
 
-static struct bench mem_benchmarks[] = {
+static const struct bench mem_benchmarks[] = {
 	{ "memcpy",	"Benchmark for memcpy() functions",		bench_mem_memcpy	},
 	{ "memset",	"Benchmark for memset() functions",		bench_mem_memset	},
 	{ "find_bit",	"Benchmark for find_bit() functions",		bench_mem_find_bit	},
@@ -70,7 +70,7 @@ static struct bench mem_benchmarks[] = {
 	{ NULL,		NULL,						NULL			}
 };
 
-static struct bench futex_benchmarks[] = {
+static const struct bench futex_benchmarks[] = {
 	{ "hash",	"Benchmark for futex hash table",               bench_futex_hash	},
 	{ "wake",	"Benchmark for futex wake calls",               bench_futex_wake	},
 	{ "wake-parallel", "Benchmark for parallel futex wake calls",   bench_futex_wake_parallel },
@@ -82,7 +82,7 @@ static struct bench futex_benchmarks[] = {
 };
 
 #ifdef HAVE_EVENTFD_SUPPORT
-static struct bench epoll_benchmarks[] = {
+static const struct bench epoll_benchmarks[] = {
 	{ "wait",	"Benchmark epoll concurrent epoll_waits",       bench_epoll_wait	},
 	{ "ctl",	"Benchmark epoll concurrent epoll_ctls",        bench_epoll_ctl		},
 	{ "all",	"Run all futex benchmarks",			NULL			},
@@ -90,7 +90,7 @@ static struct bench epoll_benchmarks[] = {
 };
 #endif // HAVE_EVENTFD_SUPPORT
 
-static struct bench internals_benchmarks[] = {
+static const struct bench internals_benchmarks[] = {
 	{ "synthesize", "Benchmark perf event synthesis",	bench_synthesize	},
 	{ "kallsyms-parse", "Benchmark kallsyms parsing",	bench_kallsyms_parse	},
 	{ "inject-build-id", "Benchmark build-id injection",	bench_inject_build_id	},
@@ -99,14 +99,14 @@ static struct bench internals_benchmarks[] = {
 	{ NULL,		NULL,					NULL			}
 };
 
-static struct bench breakpoint_benchmarks[] = {
+static const struct bench breakpoint_benchmarks[] = {
 	{ "thread", "Benchmark thread start/finish with breakpoints", bench_breakpoint_thread},
 	{ "enable", "Benchmark breakpoint enable/disable", bench_breakpoint_enable},
 	{ "all", "Run all breakpoint benchmarks", NULL},
 	{ NULL,	NULL, NULL },
 };
 
-static struct bench uprobe_benchmarks[] = {
+static const struct bench uprobe_benchmarks[] = {
 	{ "baseline",	"Baseline libc usleep(1000) call",				bench_uprobe_baseline,	},
 	{ "empty",	"Attach empty BPF prog to uprobe on usleep, system wide",	bench_uprobe_empty,	},
 	{ "trace_printk", "Attach trace_printk BPF prog to uprobe on usleep syswide",	bench_uprobe_trace_printk,	},
@@ -116,12 +116,12 @@ static struct bench uprobe_benchmarks[] = {
 };
 
 struct collection {
-	const char	*name;
-	const char	*summary;
-	struct bench	*benchmarks;
+	const char		*name;
+	const char		*summary;
+	const struct bench	*benchmarks;
 };
 
-static struct collection collections[] = {
+static const struct collection collections[] = {
 	{ "sched",	"Scheduler and IPC benchmarks",			sched_benchmarks	},
 	{ "syscall",	"System call benchmarks",			syscall_benchmarks	},
 	{ "mem",	"Memory access benchmarks",			mem_benchmarks		},
@@ -147,9 +147,9 @@ static struct collection collections[] = {
 #define for_each_bench(coll, bench) \
 	for (bench = coll->benchmarks; bench && bench->name; bench++)
 
-static void dump_benchmarks(struct collection *coll)
+static void dump_benchmarks(const struct collection *coll)
 {
-	struct bench *bench;
+	const struct bench *bench;
 
 	printf("\n        # List of available benchmarks for collection '%s':\n\n", coll->name);
 
@@ -178,7 +178,7 @@ static const char * const bench_usage[] = {
 
 static void print_usage(void)
 {
-	struct collection *coll;
+	const struct collection *coll;
 	int i;
 
 	printf("Usage: \n");
@@ -234,9 +234,9 @@ static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t f
 	return ret;
 }
 
-static void run_collection(struct collection *coll)
+static void run_collection(const struct collection *coll)
 {
-	struct bench *bench;
+	const struct bench *bench;
 	const char *argv[2];
 
 	argv[1] = NULL;
@@ -260,7 +260,7 @@ static void run_collection(struct collection *coll)
 
 static void run_all_collections(void)
 {
-	struct collection *coll;
+	const struct collection *coll;
 
 	for_each_collection(coll)
 		run_collection(coll);
@@ -268,7 +268,7 @@ static void run_all_collections(void)
 
 int cmd_bench(int argc, const char **argv)
 {
-	struct collection *coll;
+	const struct collection *coll;
 	int ret = 0;
 
 	/* Unbuffered output */
@@ -306,7 +306,7 @@ int cmd_bench(int argc, const char **argv)
 	}
 
 	for_each_collection(coll) {
-		struct bench *bench;
+		const struct bench *bench;
 
 		if (strcmp(coll->name, argv[0]))
 			continue;

From fc32ae6df83d78145391bfdaf0e213babad8e93f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:32:00 -0300
Subject: [PATCH 103/131] perf header: Use a max number of command line args

Sashiko suggests we use some reasonable max number of args to avoid
overflows when reading perf.data files, do it.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index a3b7b796639b..a18f216f77c2 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2795,6 +2795,9 @@ process_event_desc(struct feat_fd *ff, void *data __maybe_unused)
 	return 0;
 }
 
+// Some reasonable arbitrary max for the number of command line arguments
+#define MAX_CMDLINE_NR 32768
+
 static int process_cmdline(struct feat_fd *ff, void *data __maybe_unused)
 {
 	struct perf_env *env = &ff->ph->env;
@@ -2804,6 +2807,9 @@ static int process_cmdline(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &nr))
 		return -1;
 
+	if (nr > MAX_CMDLINE_NR)
+		return -1;
+
 	env->nr_cmdline = nr;
 
 	cmdline = zalloc(ff->size + nr + 1);

From 7507abd16a05e8b191ed7bed69e075b23111c401 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:32:01 -0300
Subject: [PATCH 104/131] perf header: Do validation of perf.data
 HEADER_CPU_DOMAIN_INFO

As suggested in an unrelated sashiko review:

  https://sashiko.dev/#/patchset/20260407195145.2372104-1-acme%40kernel.org

"
Could a malformed perf.data file provide out-of-bounds values for cpu and
domain?
These variables are read directly from the file and used as indices for
cd_map and cd_map[cpu]->domains without any validation against
env->nr_cpus_avail or max_sched_domains.
Similar to the issue above, this is an existing lack of validation that
becomes apparent when looking at the allocation boundaries.
"

Validate it.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index a18f216f77c2..4925e33778b9 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3717,6 +3717,11 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 		if (do_read_u32(ff, &cpu))
 			return -1;
 
+		if (cpu >= nra) {
+			pr_err("Invalid HEADER_CPU_DOMAIN_INFO: cpu %d >= nr_cpus_avail (%d)\n", cpu, nra);
+			return -1;
+		}
+
 		cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
 		if (!cd_map[cpu])
 			return -1;
@@ -3736,6 +3741,12 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 			if (do_read_u32(ff, &domain))
 				return -1;
 
+			if (domain >= max_sched_domains) {
+				pr_err("Invalid HEADER_CPU_DOMAIN_INFO: domain %d >= max_sched_domains (%d)\n",
+				       domain, max_sched_domains);
+				return -1;
+			}
+
 			d_info = zalloc(sizeof(*d_info));
 			if (!d_info)
 				return -1;

From fbfb858552fb9a4c869e22f3303c7c7365367509 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:32:02 -0300
Subject: [PATCH 105/131] perf tools: Use calloc() where applicable

Use calloc(nr_entries, sizeof_entry) instead of
zalloc(nr_entries * sizeof_entry), since that is what calloc() does.

Remove the linux/zalloc.h include where it is no longer needed, and add
it where it is needed but was only being included indirectly.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/arch/arm/util/auxtrace.c          |  6 +++---
 tools/perf/arch/powerpc/util/auxtrace.c      |  1 +
 tools/perf/arch/x86/tests/amd-ibs-period.c   |  3 +--
 tools/perf/arch/x86/tests/dwarf-unwind.c     | 11 +----------
 tools/perf/arch/x86/util/pmu.c               |  1 -
 tools/perf/bench/numa.c                      | 13 ++++---------
 tools/perf/bench/sched-messaging.c           |  2 +-
 tools/perf/builtin-annotate.c                |  1 -
 tools/perf/builtin-c2c.c                     |  6 +++---
 tools/perf/builtin-diff.c                    |  2 +-
 tools/perf/builtin-ftrace.c                  |  1 +
 tools/perf/builtin-kwork.c                   |  2 +-
 tools/perf/builtin-record.c                  | 10 +++++-----
 tools/perf/builtin-sched.c                   |  6 +++---
 tools/perf/builtin-script.c                  |  8 ++++----
 tools/perf/builtin-stat.c                    |  2 +-
 tools/perf/builtin-trace.c                   |  4 +---
 tools/perf/jvmti/libjvmti.c                  |  5 ++---
 tools/perf/tests/code-reading.c              |  1 +
 tools/perf/tests/thread-map.c                |  1 -
 tools/perf/util/annotate-arch/annotate-x86.c |  1 +
 tools/perf/util/bpf-event.c                  |  2 +-
 tools/perf/util/bpf_counter_cgroup.c         |  1 -
 tools/perf/util/data-convert-bt.c            |  2 +-
 tools/perf/util/data.c                       |  2 +-
 tools/perf/util/db-export.c                  |  1 -
 tools/perf/util/disasm.c                     |  1 +
 tools/perf/util/event.c                      |  1 -
 tools/perf/util/evlist.c                     |  3 +--
 tools/perf/util/header.c                     | 18 +++++++++---------
 tools/perf/util/hist.c                       |  2 +-
 tools/perf/util/mem2node.c                   |  2 +-
 tools/perf/util/pmus.c                       |  2 +-
 tools/perf/util/powerpc-vpadtl.c             |  1 +
 tools/perf/util/probe-event.c                | 17 ++++++++---------
 tools/perf/util/probe-file.c                 |  2 +-
 tools/perf/util/probe-finder.c               |  8 ++++----
 tools/perf/util/session.c                    |  2 +-
 tools/perf/util/srcline.c                    |  1 +
 tools/perf/util/stat-shadow.c                |  1 -
 tools/perf/util/unwind-libunwind-local.c     |  1 -
 tools/perf/util/values.c                     |  8 ++++----
 42 files changed, 72 insertions(+), 93 deletions(-)

diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
index eb6404267f17..27bb14c8b880 100644
--- a/tools/perf/arch/arm/util/auxtrace.c
+++ b/tools/perf/arch/arm/util/auxtrace.c
@@ -8,7 +8,7 @@
 #include <errno.h>
 #include <stdbool.h>
 #include <linux/coresight-pmu.h>
-#include <linux/zalloc.h>
+#include <stdlib.h>
 #include <api/fs/fs.h>
 
 #include "../../../util/auxtrace.h"
@@ -27,7 +27,7 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
 	/* arm_spe_xxxxxxxxx\0 */
 	char arm_spe_pmu_name[sizeof(ARM_SPE_PMU_NAME) + 10];
 
-	arm_spe_pmus = zalloc(sizeof(struct perf_pmu *) * nr_cpus);
+	arm_spe_pmus = calloc(nr_cpus, sizeof(struct perf_pmu *));
 	if (!arm_spe_pmus) {
 		pr_err("spes alloc failed\n");
 		*err = -ENOMEM;
@@ -79,7 +79,7 @@ static struct perf_pmu **find_all_hisi_ptt_pmus(int *nr_ptts, int *err)
 	if (!(*nr_ptts))
 		goto out;
 
-	hisi_ptt_pmus = zalloc(sizeof(struct perf_pmu *) * (*nr_ptts));
+	hisi_ptt_pmus = calloc((*nr_ptts), sizeof(struct perf_pmu *));
 	if (!hisi_ptt_pmus) {
 		pr_err("hisi_ptt alloc failed\n");
 		*err = -ENOMEM;
diff --git a/tools/perf/arch/powerpc/util/auxtrace.c b/tools/perf/arch/powerpc/util/auxtrace.c
index 292ea335e4ff..e39deff6c857 100644
--- a/tools/perf/arch/powerpc/util/auxtrace.c
+++ b/tools/perf/arch/powerpc/util/auxtrace.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/zalloc.h>
 
 #include "../../util/evlist.h"
 #include "../../util/debug.h"
diff --git a/tools/perf/arch/x86/tests/amd-ibs-period.c b/tools/perf/arch/x86/tests/amd-ibs-period.c
index 223e059e04de..cee9e11c05e0 100644
--- a/tools/perf/arch/x86/tests/amd-ibs-period.c
+++ b/tools/perf/arch/x86/tests/amd-ibs-period.c
@@ -8,7 +8,6 @@
 
 #include "arch-tests.h"
 #include "linux/perf_event.h"
-#include "linux/zalloc.h"
 #include "tests/tests.h"
 #include "../perf-sys.h"
 #include "pmu.h"
@@ -60,7 +59,7 @@ static int dummy_workload_1(unsigned long count)
 		0xcc, /* int 3 */
 	};
 
-	p = zalloc(2 * page_size);
+	p = calloc(2, page_size);
 	if (!p) {
 		printf("malloc() failed. %m");
 		return 1;
diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c
index e91a73d09cec..99d2b7ed016f 100644
--- a/tools/perf/arch/x86/tests/dwarf-unwind.c
+++ b/tools/perf/arch/x86/tests/dwarf-unwind.c
@@ -54,22 +54,13 @@ int test__arch_unwind_sample(struct perf_sample *sample,
 			     struct thread *thread)
 {
 	struct regs_dump *regs = perf_sample__user_regs(sample);
-	u64 *buf;
+	u64 *buf = calloc(PERF_REGS_MAX, sizeof(u64));
 
-	buf = malloc(sizeof(u64) * PERF_REGS_MAX);
 	if (!buf) {
 		pr_debug("failed to allocate sample uregs data\n");
 		return -1;
 	}
 
-#ifdef MEMORY_SANITIZER
-	/*
-	 * Assignments to buf in the assembly function perf_regs_load aren't
-	 * seen by memory sanitizer. Zero the memory to convince memory
-	 * sanitizer the memory is initialized.
-	 */
-	memset(buf, 0, sizeof(u64) * PERF_REGS_MAX);
-#endif
 	perf_regs_load(buf);
 	regs->abi  = PERF_SAMPLE_REGS_ABI;
 	regs->regs = buf;
diff --git a/tools/perf/arch/x86/util/pmu.c b/tools/perf/arch/x86/util/pmu.c
index 0661e0f0b02d..7c9d238922a6 100644
--- a/tools/perf/arch/x86/util/pmu.c
+++ b/tools/perf/arch/x86/util/pmu.c
@@ -7,7 +7,6 @@
 #include <linux/stddef.h>
 #include <linux/string.h>
 #include <linux/perf_event.h>
-#include <linux/zalloc.h>
 #include <api/fs/fs.h>
 #include <api/io_dir.h>
 #include <internal/cpumap.h>
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 6588a9b0b15a..42d7afc03f9b 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -32,7 +32,6 @@
 #include <linux/kernel.h>
 #include <linux/time64.h>
 #include <linux/numa.h>
-#include <linux/zalloc.h>
 
 #include "../util/header.h"
 #include "../util/mutex.h"
@@ -980,10 +979,8 @@ static int count_process_nodes(int process_nr)
 	int nodes;
 	int n, t;
 
-	node_present = (char *)malloc(g->p.nr_nodes * sizeof(char));
+	node_present = calloc(g->p.nr_nodes, sizeof(char));
 	BUG_ON(!node_present);
-	for (nodes = 0; nodes < g->p.nr_nodes; nodes++)
-		node_present[nodes] = 0;
 
 	for (t = 0; t < g->p.nr_threads; t++) {
 		struct thread_data *td;
@@ -1090,10 +1087,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 	if (!g->p.show_convergence && !g->p.measure_convergence)
 		return;
 
-	nodes = (int *)malloc(g->p.nr_nodes * sizeof(int));
+	nodes = calloc(g->p.nr_nodes, sizeof(int));
 	BUG_ON(!nodes);
-	for (node = 0; node < g->p.nr_nodes; node++)
-		nodes[node] = 0;
 
 	loops_done_min = -1;
 	loops_done_max = 0;
@@ -1423,7 +1418,7 @@ static void worker_process(int process_nr)
 	bind_to_memnode(td->bind_node);
 	bind_to_cpumask(td->bind_cpumask);
 
-	pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
+	pthreads = calloc(g->p.nr_threads, sizeof(pthread_t));
 	process_data = setup_private_data(g->p.bytes_process);
 
 	if (g->p.show_details >= 3) {
@@ -1629,7 +1624,7 @@ static int __bench_numa(const char *name)
 	if (init())
 		return -1;
 
-	pids = zalloc(g->p.nr_proc * sizeof(*pids));
+	pids = calloc(g->p.nr_proc, sizeof(*pids));
 	pid = -1;
 
 	if (g->p.serialize_startup) {
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index 93dcd9dba3d0..4fb6657fc826 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -301,7 +301,7 @@ int bench_sched_messaging(int argc, const char **argv)
 	argc = parse_options(argc, argv, options,
 			     bench_sched_message_usage, 0);
 
-	worker_tab = malloc(num_fds * 2 * num_groups * sizeof(union messaging_worker));
+	worker_tab = calloc(num_fds * 2 * num_groups, sizeof(union messaging_worker));
 	if (!worker_tab)
 		err(EXIT_FAILURE, "main:malloc()");
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 530348b6981b..5e57b78548f4 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -13,7 +13,6 @@
 #include <linux/list.h>
 #include "util/cache.h"
 #include <linux/rbtree.h>
-#include <linux/zalloc.h>
 #include "util/symbol.h"
 
 #include "util/debug.h"
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 3ce5f0adec2f..72a7802775ee 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -155,7 +155,7 @@ static void *c2c_he_zalloc(size_t size)
 	if (!c2c_he->nodeset)
 		goto out_free;
 
-	c2c_he->node_stats = zalloc(c2c.nodes_cnt * sizeof(*c2c_he->node_stats));
+	c2c_he->node_stats = calloc(c2c.nodes_cnt, sizeof(*c2c_he->node_stats));
 	if (!c2c_he->node_stats)
 		goto out_free;
 
@@ -2324,13 +2324,13 @@ static int setup_nodes(struct perf_session *session)
 	if (!n)
 		return -EINVAL;
 
-	nodes = zalloc(sizeof(unsigned long *) * c2c.nodes_cnt);
+	nodes = calloc(c2c.nodes_cnt, sizeof(unsigned long *));
 	if (!nodes)
 		return -ENOMEM;
 
 	c2c.nodes = nodes;
 
-	cpu2node = zalloc(sizeof(int) * c2c.cpus_cnt);
+	cpu2node = calloc(c2c.cpus_cnt, sizeof(int));
 	if (!cpu2node)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 2c59e43901fe..1b3df868849a 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -1891,7 +1891,7 @@ static int data_init(int argc, const char **argv)
 		return -EINVAL;
 	}
 
-	data__files = zalloc(sizeof(*data__files) * data__files_cnt);
+	data__files = calloc(data__files_cnt, sizeof(*data__files));
 	if (!data__files)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index 4cc33452d79b..8a7dbfb14535 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -20,6 +20,7 @@
 #include <linux/capability.h>
 #include <linux/err.h>
 #include <linux/string.h>
+#include <linux/zalloc.h>
 #include <sys/stat.h>
 
 #include "debug.h"
diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c
index 1140e00e874f..9d3a4c779a41 100644
--- a/tools/perf/builtin-kwork.c
+++ b/tools/perf/builtin-kwork.c
@@ -2208,7 +2208,7 @@ static int perf_kwork__top(struct perf_kwork *kwork)
 	struct __top_cpus_runtime *cpus_runtime;
 	int ret = 0;
 
-	cpus_runtime = zalloc(sizeof(struct __top_cpus_runtime) * (MAX_NR_CPUS + 1));
+	cpus_runtime = calloc(MAX_NR_CPUS + 1, sizeof(struct __top_cpus_runtime));
 	if (!cpus_runtime)
 		return -1;
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e919d1f021c3..1adc37b45152 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1070,12 +1070,12 @@ static int record__thread_data_init_maps(struct record_thread *thread_data, stru
 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
 						      thread_data->mask->maps.nbits);
 	if (mmap) {
-		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
+		thread_data->maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
 		if (!thread_data->maps)
 			return -ENOMEM;
 	}
 	if (overwrite_mmap) {
-		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
+		thread_data->overwrite_maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
 		if (!thread_data->overwrite_maps) {
 			zfree(&thread_data->maps);
 			return -ENOMEM;
@@ -1220,7 +1220,7 @@ static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
 	int t, ret;
 	struct record_thread *thread_data;
 
-	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
+	rec->thread_data = calloc(rec->nr_threads, sizeof(*(rec->thread_data)));
 	if (!rec->thread_data) {
 		pr_err("Failed to allocate thread data\n");
 		return -ENOMEM;
@@ -3710,7 +3710,7 @@ static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr
 {
 	int t, ret;
 
-	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
+	rec->thread_masks = calloc(nr_threads, sizeof(*(rec->thread_masks)));
 	if (!rec->thread_masks) {
 		pr_err("Failed to allocate thread masks\n");
 		return -ENOMEM;
@@ -3920,7 +3920,7 @@ static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_ma
 		return -ENOMEM;
 	}
 
-	spec = zalloc(topo->nr * sizeof(char *));
+	spec = calloc(topo->nr, sizeof(char *));
 	if (!spec) {
 		pr_err("Failed to allocate NUMA spec\n");
 		ret = -ENOMEM;
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 9fb5447f9014..555247568e7a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -2405,7 +2405,7 @@ static int init_idle_threads(int ncpu)
 {
 	int i, ret;
 
-	idle_threads = zalloc(ncpu * sizeof(struct thread *));
+	idle_threads = calloc(ncpu, sizeof(struct thread *));
 	if (!idle_threads)
 		return -ENOMEM;
 
@@ -3483,7 +3483,7 @@ static int setup_cpus_switch_event(struct perf_sched *sched)
 	if (!sched->cpu_last_switched)
 		return -1;
 
-	sched->curr_pid = malloc(MAX_CPUS * sizeof(*(sched->curr_pid)));
+	sched->curr_pid = calloc(MAX_CPUS, sizeof(*(sched->curr_pid)));
 	if (!sched->curr_pid) {
 		zfree(&sched->cpu_last_switched);
 		return -1;
@@ -3559,7 +3559,7 @@ static int setup_map_cpus(struct perf_sched *sched)
 	sched->max_cpu.cpu  = sysconf(_SC_NPROCESSORS_CONF);
 
 	if (sched->map.comp) {
-		sched->map.comp_cpus = zalloc(sched->max_cpu.cpu * sizeof(int));
+		sched->map.comp_cpus = calloc(sched->max_cpu.cpu, sizeof(int));
 		if (!sched->map.comp_cpus)
 			return -1;
 	}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 43ce119dac3e..c8ac9f01a36b 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -3823,7 +3823,7 @@ out:
 
 static int have_cmd(int argc, const char **argv)
 {
-	char **__argv = malloc(sizeof(const char *) * argc);
+	char **__argv = calloc(argc, sizeof(const char *));
 
 	if (!__argv) {
 		pr_err("malloc failed\n");
@@ -4312,7 +4312,7 @@ int cmd_script(int argc, const char **argv)
 				}
 			}
 
-			__argv = malloc((argc + 6) * sizeof(const char *));
+			__argv = calloc(argc + 6, sizeof(const char *));
 			if (!__argv) {
 				pr_err("malloc failed\n");
 				err = -ENOMEM;
@@ -4338,7 +4338,7 @@ int cmd_script(int argc, const char **argv)
 		dup2(live_pipe[0], 0);
 		close(live_pipe[1]);
 
-		__argv = malloc((argc + 4) * sizeof(const char *));
+		__argv = calloc(argc + 4, sizeof(const char *));
 		if (!__argv) {
 			pr_err("malloc failed\n");
 			err = -ENOMEM;
@@ -4376,7 +4376,7 @@ script_found:
 			}
 		}
 
-		__argv = malloc((argc + 2) * sizeof(const char *));
+		__argv = calloc(argc + 2, sizeof(const char *));
 		if (!__argv) {
 			pr_err("malloc failed\n");
 			err = -ENOMEM;
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 35934e8bbd51..99d7db372b48 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -2774,7 +2774,7 @@ int cmd_stat(int argc, const char **argv)
 	}
 
 	if (stat_config.walltime_run_table) {
-		stat_config.walltime_run = zalloc(stat_config.run_count * sizeof(stat_config.walltime_run[0]));
+		stat_config.walltime_run = calloc(stat_config.run_count, sizeof(stat_config.walltime_run[0]));
 		if (!stat_config.walltime_run) {
 			pr_err("failed to setup -r option");
 			goto out;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 873d144807e2..e58c49d047a2 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -2269,9 +2269,7 @@ static int trace__validate_ev_qualifier(struct trace *trace)
 	struct str_node *pos;
 	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);
 
-	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
-						 sizeof(trace->ev_qualifier_ids.entries[0]));
-
+	trace->ev_qualifier_ids.entries = calloc(nr_allocated, sizeof(trace->ev_qualifier_ids.entries[0]));
 	if (trace->ev_qualifier_ids.entries == NULL) {
 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
 		       trace->output);
diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c
index 87bfd4781003..d3dc53010e76 100644
--- a/tools/perf/jvmti/libjvmti.c
+++ b/tools/perf/jvmti/libjvmti.c
@@ -98,7 +98,7 @@ get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **
 	/*
 	 * Phase 2 -- allocate big enough line table
 	 */
-	*tab = malloc(nr_total * sizeof(**tab));
+	*tab = calloc(nr_total, sizeof(**tab));
 	if (!*tab)
 		return JVMTI_ERROR_OUT_OF_MEMORY;
 
@@ -262,11 +262,10 @@ compiled_method_load_cb(jvmtiEnv *jvmti,
 			}
 			nr_lines = 0;
 		} else if (nr_lines > 0) {
-			line_file_names = malloc(sizeof(char*) * nr_lines);
+			line_file_names = calloc(nr_lines, sizeof(char *));
 			if (!line_file_names) {
 				warnx("jvmti: cannot allocate space for line table method names");
 			} else {
-				memset(line_file_names, 0, sizeof(char*) * nr_lines);
 				ret = fill_source_filenames(jvmti, nr_lines, line_tab, line_file_names);
 				if (ret != JVMTI_ERROR_NONE) {
 					warnx("jvmti: fill_source_filenames failed");
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 5927d1ea20e2..47043a3a2fb4 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/rbtree.h>
 #include <linux/types.h>
+#include <linux/zalloc.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <unistd.h>
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index 54209592168d..877868107455 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -9,7 +9,6 @@
 #include "debug.h"
 #include "event.h"
 #include "util/synthetic-events.h"
-#include <linux/zalloc.h>
 #include <perf/event.h>
 #include <internal/threadmap.h>
 
diff --git a/tools/perf/util/annotate-arch/annotate-x86.c b/tools/perf/util/annotate-arch/annotate-x86.c
index c77aabd48eba..7e6136536393 100644
--- a/tools/perf/util/annotate-arch/annotate-x86.c
+++ b/tools/perf/util/annotate-arch/annotate-x86.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <string.h>
 #include <linux/compiler.h>
+#include <linux/zalloc.h>
 #include <assert.h>
 #include <inttypes.h>
 #include "../annotate-data.h"
diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c
index 67e7786bb878..a27945c279ef 100644
--- a/tools/perf/util/bpf-event.c
+++ b/tools/perf/util/bpf-event.c
@@ -349,7 +349,7 @@ static struct bpf_metadata *bpf_metadata_alloc(__u32 nr_prog_tags,
 	if (!metadata)
 		return NULL;
 
-	metadata->prog_names = zalloc(nr_prog_tags * sizeof(char *));
+	metadata->prog_names = calloc(nr_prog_tags, sizeof(char *));
 	if (!metadata->prog_names) {
 		bpf_metadata_free(metadata);
 		return NULL;
diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c
index 5572ceccf860..519fee3dc3d0 100644
--- a/tools/perf/util/bpf_counter_cgroup.c
+++ b/tools/perf/util/bpf_counter_cgroup.c
@@ -11,7 +11,6 @@
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <linux/err.h>
-#include <linux/zalloc.h>
 #include <linux/perf_event.h>
 #include <api/fs/fs.h>
 #include <bpf/bpf.h>
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index bece77cbc493..3b8f2df823a9 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1379,7 +1379,7 @@ static int setup_streams(struct ctf_writer *cw, struct perf_session *session)
 	 */
 	ncpus = env->nr_cpus_avail ?: MAX_CPUS;
 
-	stream = zalloc(sizeof(*stream) * ncpus);
+	stream = calloc(ncpus, sizeof(*stream));
 	if (!stream) {
 		pr_err("Failed to allocate streams.\n");
 		return -ENOMEM;
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 90df41da1a32..14fa83dae71a 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -43,7 +43,7 @@ int perf_data__create_dir(struct perf_data *data, int nr)
 	if (WARN_ON(!data->is_dir))
 		return -EINVAL;
 
-	files = zalloc(nr * sizeof(*files));
+	files = calloc(nr, sizeof(*files));
 	if (!files)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/db-export.c b/tools/perf/util/db-export.c
index ae9a9065aab7..cc2bb1af4243 100644
--- a/tools/perf/util/db-export.c
+++ b/tools/perf/util/db-export.c
@@ -19,7 +19,6 @@
 #include "callchain.h"
 #include "call-path.h"
 #include "db-export.h"
-#include <linux/zalloc.h>
 
 int db_export__init(struct db_export *dbe)
 {
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 40fcaed5d0b1..4f5bd9153552 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -13,6 +13,7 @@
 #include <unistd.h>
 
 #include <linux/string.h>
+#include <linux/zalloc.h>
 #include <subcmd/run-command.h>
 
 #include "annotate.h"
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index bc045fddf7d5..66f4843bb235 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -12,7 +12,6 @@
 #include <unistd.h>
 #include <uapi/linux/mman.h> /* To get things like MAP_HUGETLB even on older libc headers */
 #include <linux/perf_event.h>
-#include <linux/zalloc.h>
 #include "cpumap.h"
 #include "dso.h"
 #include "event.h"
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index f46e1d40bad7..ee971d15b3c6 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -825,9 +825,8 @@ static struct mmap *evlist__alloc_mmap(struct evlist *evlist,
 				       bool overwrite)
 {
 	int i;
-	struct mmap *map;
+	struct mmap *map = calloc(evlist->core.nr_mmaps, sizeof(struct mmap));
 
-	map = zalloc(evlist->core.nr_mmaps * sizeof(struct mmap));
 	if (!map)
 		return NULL;
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4925e33778b9..c6efddb70aee 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2816,7 +2816,7 @@ static int process_cmdline(struct feat_fd *ff, void *data __maybe_unused)
 	if (!cmdline)
 		return -1;
 
-	argv = zalloc(sizeof(char *) * (nr + 1));
+	argv = calloc(nr + 1, sizeof(char *));
 	if (!argv)
 		goto error;
 
@@ -2970,7 +2970,7 @@ static int process_numa_topology(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &nr))
 		return -1;
 
-	nodes = zalloc(sizeof(*nodes) * nr);
+	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -ENOMEM;
 
@@ -3168,7 +3168,7 @@ static int process_cache(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &cnt))
 		return -1;
 
-	caches = zalloc(sizeof(*caches) * cnt);
+	caches = calloc(cnt, sizeof(*caches));
 	if (!caches)
 		return -1;
 
@@ -3260,7 +3260,7 @@ static int process_mem_topology(struct feat_fd *ff,
 	if (do_read_u64(ff, &nr))
 		return -1;
 
-	nodes = zalloc(sizeof(*nodes) * nr);
+	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -1;
 
@@ -3350,7 +3350,7 @@ static int process_hybrid_topology(struct feat_fd *ff,
 	if (do_read_u32(ff, &nr))
 		return -1;
 
-	nodes = zalloc(sizeof(*nodes) * nr);
+	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -ENOMEM;
 
@@ -3565,7 +3565,7 @@ static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps,
 	if (!nr_pmu_caps)
 		return 0;
 
-	*caps = zalloc(sizeof(char *) * nr_pmu_caps);
+	*caps = calloc(nr_pmu_caps, sizeof(char *));
 	if (!*caps)
 		return -1;
 
@@ -3642,7 +3642,7 @@ static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused)
 		return 0;
 	}
 
-	pmu_caps = zalloc(sizeof(*pmu_caps) * nr_pmu);
+	pmu_caps = calloc(nr_pmu, sizeof(*pmu_caps));
 	if (!pmu_caps)
 		return -ENOMEM;
 
@@ -3695,7 +3695,7 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 	nra = env->nr_cpus_avail;
 	nr = env->nr_cpus_online;
 
-	cd_map = zalloc(sizeof(*cd_map) * nra);
+	cd_map = calloc(nra, sizeof(*cd_map));
 	if (!cd_map)
 		return -1;
 
@@ -3733,7 +3733,7 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 
 		cd_map[cpu]->nr_domains = nr_domains;
 
-		cd_map[cpu]->domains = zalloc(sizeof(*d_info) * max_sched_domains);
+		cd_map[cpu]->domains = calloc(max_sched_domains, sizeof(*d_info));
 		if (!cd_map[cpu]->domains)
 			return -1;
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index fc737a0a8e4d..747fdc455c80 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1151,7 +1151,7 @@ iter_prepare_cumulative_entry(struct hist_entry_iter *iter,
 	 * cumulated only one time to prevent entries more than 100%
 	 * overhead.
 	 */
-	he_cache = malloc(sizeof(*he_cache) * (cursor->nr + 1));
+	he_cache = calloc(cursor->nr + 1, sizeof(*he_cache));
 	if (he_cache == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/mem2node.c b/tools/perf/util/mem2node.c
index 03a7d7b27737..51a2292cbf7e 100644
--- a/tools/perf/util/mem2node.c
+++ b/tools/perf/util/mem2node.c
@@ -59,7 +59,7 @@ int mem2node__init(struct mem2node *map, struct perf_env *env)
 		max += bitmap_weight(n->set, n->size);
 	}
 
-	entries = zalloc(sizeof(*entries) * max);
+	entries = calloc(max, sizeof(*entries));
 	if (!entries)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/pmus.c b/tools/perf/util/pmus.c
index 98be2eb8f1f0..9a2023ceeefd 100644
--- a/tools/perf/util/pmus.c
+++ b/tools/perf/util/pmus.c
@@ -621,7 +621,7 @@ void perf_pmus__print_pmu_events(const struct print_callbacks *print_cb, void *p
 	while ((pmu = scan_fn(pmu)) != NULL)
 		len += perf_pmu__num_events(pmu);
 
-	aliases = zalloc(sizeof(struct sevent) * len);
+	aliases = calloc(len, sizeof(struct sevent));
 	if (!aliases) {
 		pr_err("FATAL: not enough memory to print PMU events\n");
 		return;
diff --git a/tools/perf/util/powerpc-vpadtl.c b/tools/perf/util/powerpc-vpadtl.c
index 993ab16614c7..710f3093f3f9 100644
--- a/tools/perf/util/powerpc-vpadtl.c
+++ b/tools/perf/util/powerpc-vpadtl.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/string.h>
+#include <linux/zalloc.h>
 #include <errno.h>
 #include <inttypes.h>
 #include "color.h"
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 710e4620923e..f37a783ea772 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1850,7 +1850,7 @@ int parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 
 	/* Copy arguments and ensure return probe has no C argument */
 	pev->nargs = argc - 1;
-	pev->args = zalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	pev->args = calloc(pev->nargs, sizeof(struct perf_probe_arg));
 	if (pev->args == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -2000,7 +2000,7 @@ int parse_probe_trace_command(const char *cmd, struct probe_trace_event *tev)
 	}
 
 	tev->nargs = argc - 2;
-	tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs);
+	tev->args = calloc(tev->nargs, sizeof(struct probe_trace_arg));
 	if (tev->args == NULL) {
 		ret = -ENOMEM;
 		goto out;
@@ -2373,7 +2373,7 @@ static int convert_to_perf_probe_event(struct probe_trace_event *tev,
 
 	/* Convert trace_arg to probe_arg */
 	pev->nargs = tev->nargs;
-	pev->args = zalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	pev->args = calloc(pev->nargs, sizeof(struct perf_probe_arg));
 	if (pev->args == NULL)
 		return -ENOMEM;
 	for (i = 0; i < tev->nargs && ret >= 0; i++) {
@@ -2480,7 +2480,7 @@ int perf_probe_event__copy(struct perf_probe_event *dst,
 	if (perf_probe_point__copy(&dst->point, &src->point) < 0)
 		goto out_err;
 
-	dst->args = zalloc(sizeof(struct perf_probe_arg) * src->nargs);
+	dst->args = calloc(src->nargs, sizeof(struct perf_probe_arg));
 	if (!dst->args)
 		goto out_err;
 	dst->nargs = src->nargs;
@@ -3179,7 +3179,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 	}
 
 	/* Setup result trace-probe-events */
-	*tevs = zalloc(sizeof(*tev) * num_matched_functions);
+	*tevs = calloc(num_matched_functions, sizeof(*tev));
 	if (!*tevs) {
 		ret = -ENOMEM;
 		goto out;
@@ -3251,8 +3251,7 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
 		tev->uprobes = pev->uprobes;
 		tev->nargs = pev->nargs;
 		if (tev->nargs) {
-			tev->args = zalloc(sizeof(struct probe_trace_arg) *
-					   tev->nargs);
+			tev->args = calloc(tev->nargs, sizeof(struct probe_trace_arg));
 			if (tev->args == NULL)
 				goto nomem_out;
 		}
@@ -3363,7 +3362,7 @@ static int try_to_find_absolute_address(struct perf_probe_event *pev,
 	}
 
 	tev->nargs = pev->nargs;
-	tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs);
+	tev->args = calloc(tev->nargs, sizeof(struct probe_trace_arg));
 	if (!tev->args)
 		goto errout;
 
@@ -3549,7 +3548,7 @@ static int find_probe_trace_events_from_cache(struct perf_probe_event *pev,
 		goto out;
 	}
 
-	*tevs = zalloc(ret * sizeof(*tev));
+	*tevs = calloc(ret, sizeof(*tev));
 	if (!*tevs) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index f78c3bc3d601..4032572cbf55 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -414,7 +414,7 @@ int probe_cache_entry__get_event(struct probe_cache_entry *entry,
 	if (ret > probe_conf.max_probes)
 		return -E2BIG;
 
-	*tevs = zalloc(ret * sizeof(*tev));
+	*tevs = calloc(ret, sizeof(*tev));
 	if (!*tevs)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 5ffd97ee4898..64328abeef8b 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1305,7 +1305,7 @@ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
 		 tev->point.offset);
 
 	/* Expand special probe argument if exist */
-	args = zalloc(sizeof(struct perf_probe_arg) * MAX_PROBE_ARGS);
+	args = calloc(MAX_PROBE_ARGS, sizeof(struct perf_probe_arg));
 	if (args == NULL) {
 		ret = -ENOMEM;
 		goto end;
@@ -1316,7 +1316,7 @@ static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
 		goto end;
 
 	tev->nargs = ret;
-	tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs);
+	tev->args = calloc(tev->nargs, sizeof(struct probe_trace_arg));
 	if (tev->args == NULL) {
 		ret = -ENOMEM;
 		goto end;
@@ -1393,7 +1393,7 @@ int debuginfo__find_trace_events(struct debuginfo *dbg,
 	int ret, i;
 
 	/* Allocate result tevs array */
-	*tevs = zalloc(sizeof(struct probe_trace_event) * tf.max_tevs);
+	*tevs = calloc(tf.max_tevs, sizeof(struct probe_trace_event));
 	if (*tevs == NULL)
 		return -ENOMEM;
 
@@ -1566,7 +1566,7 @@ int debuginfo__find_available_vars_at(struct debuginfo *dbg,
 	int ret;
 
 	/* Allocate result vls array */
-	*vls = zalloc(sizeof(struct variable_list) * af.max_vls);
+	*vls = calloc(af.max_vls, sizeof(struct variable_list));
 	if (*vls == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7588cca110d2..312ea05e2113 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2559,7 +2559,7 @@ static int __perf_session__process_dir_events(struct perf_session *session)
 			nr_readers++;
 	}
 
-	rd = zalloc(nr_readers * sizeof(struct reader));
+	rd = calloc(nr_readers, sizeof(struct reader));
 	if (!rd)
 		return -ENOMEM;
 
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index 9be42f398440..b58710624ead 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -12,6 +12,7 @@
 #include <inttypes.h>
 #include <string.h>
 #include <linux/string.h>
+#include <linux/zalloc.h>
 
 bool srcline_full_filename;
 
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 59d2cd4f2188..bc2d44df7baf 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -13,7 +13,6 @@
 #include "metricgroup.h"
 #include "cgroup.h"
 #include "units.h"
-#include <linux/zalloc.h>
 #include "iostat.h"
 #include "util/hashmap.h"
 #include "tool_pmu.h"
diff --git a/tools/perf/util/unwind-libunwind-local.c b/tools/perf/util/unwind-libunwind-local.c
index 5b39ce21e333..87d496e9dfa6 100644
--- a/tools/perf/util/unwind-libunwind-local.c
+++ b/tools/perf/util/unwind-libunwind-local.c
@@ -25,7 +25,6 @@
 #include <unistd.h>
 #include <sys/mman.h>
 #include <linux/list.h>
-#include <linux/zalloc.h>
 #ifndef REMOTE_UNWIND_LIBUNWIND
 #include <libunwind.h>
 #include <libunwind-ptrace.h>
diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c
index ec72d29f3d58..6eaddfcf833e 100644
--- a/tools/perf/util/values.c
+++ b/tools/perf/util/values.c
@@ -13,9 +13,9 @@
 int perf_read_values_init(struct perf_read_values *values)
 {
 	values->threads_max = 16;
-	values->pid = malloc(values->threads_max * sizeof(*values->pid));
-	values->tid = malloc(values->threads_max * sizeof(*values->tid));
-	values->value = zalloc(values->threads_max * sizeof(*values->value));
+	values->pid = calloc(values->threads_max, sizeof(*values->pid));
+	values->tid = calloc(values->threads_max, sizeof(*values->tid));
+	values->value = calloc(values->threads_max, sizeof(*values->value));
 	if (!values->pid || !values->tid || !values->value) {
 		pr_debug("failed to allocate read_values threads arrays");
 		goto out_free_pid;
@@ -96,7 +96,7 @@ static int perf_read_values__findnew_thread(struct perf_read_values *values,
 
 	i = values->threads;
 
-	values->value[i] = zalloc(values->counters_max * sizeof(**values->value));
+	values->value[i] = calloc(values->counters_max, sizeof(**values->value));
 	if (!values->value[i]) {
 		pr_debug("failed to allocate read_values counters array");
 		return -ENOMEM;

From 19a9ed115fda95317c98bef0c716ea8412cd8ce0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 8 Apr 2026 14:32:03 -0300
Subject: [PATCH 106/131] perf tools: Replace basename() calls with
 perf_basename()

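As noticed in a sashiko review for a patch adding a missing libgen.h
in a file using basename(), the basename() declared in libgen.h may
modify the path buffer it is given and may return a pointer to internal
storage that is reused by subsequent calls: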

  https://sashiko.dev/#/patchset/20260402001740.2220481-1-acme%40kernel.org

So avoid these subtleties and instead reuse the gnu_basename() function
we had in srcline.c, renaming it to perf_basename() and replace
basename() calls with it, simplifying several cases by removing now
needless strdups.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-daemon.c         |  4 ++--
 tools/perf/util/annotate.c          |  3 +--
 tools/perf/util/data-convert-json.c |  4 ++--
 tools/perf/util/dsos.c              | 32 ++++++++---------------------
 tools/perf/util/probe-event.c       |  3 +--
 tools/perf/util/srcline.c           | 11 ++--------
 tools/perf/util/symbol.h            | 14 +++----------
 tools/perf/util/util.c              |  8 ++++++++
 tools/perf/util/util.h              |  2 ++
 9 files changed, 30 insertions(+), 51 deletions(-)

diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c
index 33473e071392..c4632577d129 100644
--- a/tools/perf/builtin-daemon.c
+++ b/tools/perf/builtin-daemon.c
@@ -1016,7 +1016,7 @@ static int setup_config_changes(struct daemon *daemon)
 {
 	char *basen = strdup(daemon->config_real);
 	char *dirn  = strdup(daemon->config_real);
-	char *base, *dir;
+	const char *base, *dir;
 	int fd, wd = -1;
 
 	if (!dirn || !basen)
@@ -1029,7 +1029,7 @@ static int setup_config_changes(struct daemon *daemon)
 	}
 
 	dir = dirname(dirn);
-	base = basename(basen);
+	base = perf_basename(basen);
 	pr_debug("config file: %s, dir: %s\n", base, dir);
 
 	wd = inotify_add_watch(fd, dir, IN_CLOSE_WRITE);
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 63f0ee9d4c03..e745f3034a0e 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -8,7 +8,6 @@
 
 #include <errno.h>
 #include <inttypes.h>
-#include <libgen.h>
 #include <stdlib.h>
 #include "util.h" // hex_width()
 #include "ui/ui.h"
@@ -1245,7 +1244,7 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel)
 	if (opts->full_path)
 		d_filename = filename;
 	else
-		d_filename = basename(filename);
+		d_filename = perf_basename(filename);
 
 	if (evsel__is_group_event(evsel)) {
 		evsel__group_desc(evsel, buf, sizeof(buf));
diff --git a/tools/perf/util/data-convert-json.c b/tools/perf/util/data-convert-json.c
index 4b1b2f7bed25..d526c91312ed 100644
--- a/tools/perf/util/data-convert-json.c
+++ b/tools/perf/util/data-convert-json.c
@@ -326,7 +326,7 @@ static void output_headers(struct perf_session *session, struct convert_json *c)
 	output_json_format(out, false, 2, "]");
 }
 
-int bt_convert__perf2json(const char *input_name, const char *output_name,
+int bt_convert__perf2json(const char *_input_name, const char *output_name,
 		struct perf_data_convert_opts *opts __maybe_unused)
 {
 	struct perf_session *session;
@@ -342,7 +342,7 @@ int bt_convert__perf2json(const char *input_name, const char *output_name,
 	};
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
-		.path = input_name,
+		.path = _input_name,
 		.force = opts->force,
 	};
 
diff --git a/tools/perf/util/dsos.c b/tools/perf/util/dsos.c
index 5cf8c878bab2..e927e707abac 100644
--- a/tools/perf/util/dsos.c
+++ b/tools/perf/util/dsos.c
@@ -6,7 +6,6 @@
 #include "vdso.h"
 #include "namespaces.h"
 #include <errno.h>
-#include <libgen.h>
 #include <stdlib.h>
 #include <string.h>
 #include <symbol.h> // filename__read_build_id
@@ -297,34 +296,21 @@ struct dso *dsos__find(struct dsos *dsos, const char *name, bool cmp_short)
 
 static void dso__set_basename(struct dso *dso)
 {
-	char *base, *lname;
+	bool allocated = false;
+	const char *base;
 	int tid;
 
 	if (perf_pid_map_tid(dso__long_name(dso), &tid)) {
-		if (asprintf(&base, "[JIT] tid %d", tid) < 0)
+		char *jitname;
+
+		if (asprintf(&jitname, "[JIT] tid %d", tid) < 0)
 			return;
+		allocated = true;
+		base = jitname;
 	} else {
-	      /*
-	       * basename() may modify path buffer, so we must pass
-               * a copy.
-               */
-		lname = strdup(dso__long_name(dso));
-		if (!lname)
-			return;
-
-		/*
-		 * basename() may return a pointer to internal
-		 * storage which is reused in subsequent calls
-		 * so copy the result.
-		 */
-		base = strdup(basename(lname));
-
-		free(lname);
-
-		if (!base)
-			return;
+		base = perf_basename(dso__long_name(dso));
 	}
-	dso__set_short_name(dso, base, true);
+	dso__set_short_name(dso, base, allocated);
 }
 
 static struct dso *__dsos__addnew_id(struct dsos *dsos, const char *name, const struct dso_id *id)
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index f37a783ea772..34b4badd2c14 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -11,7 +11,6 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
-#include <libgen.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -229,7 +228,7 @@ static int convert_exec_to_group(const char *exec, char **result)
 	if (!exec_copy)
 		return -ENOMEM;
 
-	ptr1 = basename(exec_copy);
+	ptr1 = (char *)perf_basename(exec_copy);
 	if (!ptr1) {
 		ret = -EINVAL;
 		goto out;
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index b58710624ead..db164d258163 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -8,6 +8,7 @@
 #include "symbol.h"
 #include "libdw.h"
 #include "debug.h"
+#include "util.h"
 
 #include <inttypes.h>
 #include <string.h>
@@ -74,14 +75,6 @@ int inline_list__append_tail(struct symbol *symbol, char *srcline, struct inline
 	return 0;
 }
 
-/* basename version that takes a const input string */
-static const char *gnu_basename(const char *path)
-{
-	const char *base = strrchr(path, '/');
-
-	return base ? base + 1 : path;
-}
-
 char *srcline_from_fileline(const char *file, unsigned int line)
 {
 	char *srcline;
@@ -90,7 +83,7 @@ char *srcline_from_fileline(const char *file, unsigned int line)
 		return NULL;
 
 	if (!srcline_full_filename)
-		file = gnu_basename(file);
+		file = perf_basename(file);
 
 	if (asprintf(&srcline, "%s:%u", file, line) < 0)
 		return NULL;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index c67814d6d6d6..bd6eb90c8668 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -14,6 +14,7 @@
 #include "path.h"
 #include "symbol_conf.h"
 #include "spark.h"
+#include "util.h"
 
 #ifdef HAVE_LIBELF_SUPPORT
 #include <libelf.h>
@@ -97,18 +98,9 @@ struct intlist;
 
 static inline int __symbol__join_symfs(char *bf, size_t size, const char *path)
 {
-	if (symbol_conf.symfs_layout_flat) {
-		char *path_copy = strdup(path);
-		char *base;
-		int ret;
+	if (symbol_conf.symfs_layout_flat)
+		return path__join(bf, size, symbol_conf.symfs, perf_basename(path));
 
-		if (!path_copy)
-			return -ENOMEM;
-		base = basename(path_copy);
-		ret = path__join(bf, size, symbol_conf.symfs, base);
-		free(path_copy);
-		return ret;
-	}
 	return path__join(bf, size, symbol_conf.symfs, path);
 }
 
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index c5fee8e39480..25849434f0a4 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -545,3 +545,11 @@ int scandirat(int dirfd, const char *dirp,
 	return err;
 }
 #endif
+
+/* basename version that takes a const input string */
+const char *perf_basename(const char *path)
+{
+	const char *base = strrchr(path, '/');
+
+	return base ? base + 1 : path;
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index e935438451b8..87a0818a8c76 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -86,6 +86,8 @@ struct perf_debuginfod {
 };
 void perf_debuginfod_setup(struct perf_debuginfod *di);
 
+const char *perf_basename(const char *path);
+
 char *filename_with_chroot(int pid, const char *filename);
 
 int do_realloc_array_as_needed(void **arr, size_t *arr_sz, size_t x,

From 80b549be27de0f11124c66eaeb5307c7b4582edd Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 8 Apr 2026 13:38:58 -0700
Subject: [PATCH 107/131] perf data: Clean up use_stdio and structures

use_stdio was associated with struct perf_data and not perf_data_file
meaning there was implicit use of fd rather than fptr that may not be
safe. For example, in perf_data_file__write. Reorganize perf_data_file
to better abstract use_stdio, add kernel-doc and more consistently use
perf_data__ accessors so that use_stdio is better respected.

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/builtin-inject.c |  7 ++-
 tools/perf/builtin-record.c | 12 +++--
 tools/perf/tests/topology.c |  3 +-
 tools/perf/util/data.c      | 99 +++++++++++++++++++++++++------------
 tools/perf/util/data.h      | 52 ++++++++++++++++---
 tools/perf/util/session.c   |  2 +-
 6 files changed, 124 insertions(+), 51 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index b4add7a70b22..f174bc69cec4 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -270,9 +270,8 @@ static s64 perf_event__repipe_auxtrace(const struct perf_tool *tool,
 	inject->have_auxtrace = true;
 
 	if (!inject->output.is_pipe) {
-		off_t offset;
+		off_t offset = perf_data__seek(&inject->output, 0, SEEK_CUR);
 
-		offset = lseek(inject->output.file.fd, 0, SEEK_CUR);
 		if (offset == -1)
 			return -errno;
 		ret = auxtrace_index__auxtrace_event(&session->auxtrace_index,
@@ -2503,12 +2502,12 @@ int cmd_inject(int argc, const char **argv)
 		.output = {
 			.path = "-",
 			.mode = PERF_DATA_MODE_WRITE,
-			.use_stdio = true,
+			.file.use_stdio = true,
 		},
 	};
 	struct perf_data data = {
 		.mode = PERF_DATA_MODE_READ,
-		.use_stdio = true,
+		.file.use_stdio = true,
 	};
 	int ret;
 	const char *known_build_ids = NULL;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1adc37b45152..4a5eba498c02 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -453,7 +453,7 @@ static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size
 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
 {
 	int ret, idx;
-	int trace_fd = rec->session->data->file.fd;
+	int trace_fd = perf_data__fd(rec->session->data);
 	struct record_aio aio = { .rec = rec, .size = 0 };
 
 	/*
@@ -1640,7 +1640,7 @@ static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
 	int rc = 0;
 	int nr_mmaps;
 	struct mmap **maps;
-	int trace_fd = rec->data.file.fd;
+	int trace_fd = perf_data__fd(&rec->data);
 	off_t off = 0;
 
 	if (!evlist)
@@ -1845,10 +1845,12 @@ record__finish_output(struct record *rec)
 	}
 
 	rec->session->header.data_size += rec->bytes_written;
-	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
+	data->file.size = perf_data__seek(data, 0, SEEK_CUR);
 	if (record__threads_enabled(rec)) {
-		for (i = 0; i < data->dir.nr; i++)
-			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
+		for (i = 0; i < data->dir.nr; i++) {
+			data->dir.files[i].size =
+				perf_data_file__seek(&data->dir.files[i], 0, SEEK_CUR);
+		}
 	}
 
 	/* Buildid scanning disabled or build ID in kernel and synthesized map events. */
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 75b748ddf824..f54502ebef4b 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -54,7 +54,8 @@ static int session_write_header(char *path)
 	session->header.data_size += DATA_SIZE;
 
 	TEST_ASSERT_VAL("failed to write header",
-			!perf_session__write_header(session, session->evlist, data.file.fd, true));
+			!perf_session__write_header(session, session->evlist,
+						    perf_data__fd(&data), true));
 
 	evlist__delete(session->evlist);
 	perf_session__delete(session);
diff --git a/tools/perf/util/data.c b/tools/perf/util/data.c
index 14fa83dae71a..94dc534a7386 100644
--- a/tools/perf/util/data.c
+++ b/tools/perf/util/data.c
@@ -20,18 +20,33 @@
 #include "rlimit.h"
 #include <internal/lib.h>
 
+static void perf_data_file__close(struct perf_data_file *file)
+{
+	if (file->use_stdio) {
+		if (file->fptr) {
+			fclose(file->fptr);
+			file->fptr = NULL;
+		}
+	} else {
+		close(file->fd);
+		file->fd = -1;
+	}
+	zfree(&file->path);
+}
+
 static void close_dir(struct perf_data_file *files, int nr)
 {
-	while (--nr >= 0) {
-		close(files[nr].fd);
-		zfree(&files[nr].path);
-	}
+	while (--nr >= 0)
+		perf_data_file__close(&files[nr]);
+
 	free(files);
 }
 
 void perf_data__close_dir(struct perf_data *data)
 {
 	close_dir(data->dir.files, data->dir.nr);
+	data->dir.files = NULL;
+	data->dir.nr = 0;
 }
 
 int perf_data__create_dir(struct perf_data *data, int nr)
@@ -132,16 +147,21 @@ int perf_data__open_dir(struct perf_data *data)
 		files = file;
 		file = &files[nr++];
 
-		file->path = strdup(path);
+		*file = (struct perf_data_file){
+			.path = strdup(path),
+			.fd = -1,
+			.size = st.st_size,
+			.use_stdio = false,
+		};
 		if (!file->path)
 			goto out_err;
 
 		ret = open(file->path, O_RDONLY);
-		if (ret < 0)
+		if (ret < 0) {
+			ret = -errno;
 			goto out_err;
-
+		}
 		file->fd = ret;
-		file->size = st.st_size;
 	}
 
 	closedir(dir);
@@ -174,7 +194,7 @@ static bool check_pipe(struct perf_data *data)
 	}
 
 	if (is_pipe) {
-		if (data->use_stdio) {
+		if (data->file.use_stdio) {
 			const char *mode;
 
 			mode = perf_data__is_read(data) ? "r" : "w";
@@ -182,7 +202,7 @@ static bool check_pipe(struct perf_data *data)
 
 			if (data->file.fptr == NULL) {
 				data->file.fd = fd;
-				data->use_stdio = false;
+				data->file.use_stdio = false;
 			}
 
 		/*
@@ -344,7 +364,7 @@ int perf_data__open(struct perf_data *data)
 		return 0;
 
 	/* currently it allows stdio for pipe only */
-	data->use_stdio = false;
+	data->file.use_stdio = false;
 
 	if (!data->path)
 		data->path = "perf.data";
@@ -364,41 +384,57 @@ void perf_data__close(struct perf_data *data)
 	if (perf_data__is_dir(data))
 		perf_data__close_dir(data);
 
-	zfree(&data->file.path);
+	perf_data_file__close(&data->file);
+}
 
-	if (data->use_stdio)
-		fclose(data->file.fptr);
-	else
-		close(data->file.fd);
+static ssize_t perf_data_file__read(struct perf_data_file *file, void *buf, size_t size)
+{
+	if (file->use_stdio) {
+		if (fread(buf, size, 1, file->fptr) == 1)
+			return size;
+		return feof(file->fptr) ? 0 : -1;
+	}
+	return readn(file->fd, buf, size);
 }
 
 ssize_t perf_data__read(struct perf_data *data, void *buf, size_t size)
 {
-	if (data->use_stdio) {
-		if (fread(buf, size, 1, data->file.fptr) == 1)
-			return size;
-		return feof(data->file.fptr) ? 0 : -1;
-	}
-	return readn(data->file.fd, buf, size);
+	return perf_data_file__read(&data->file, buf, size);
 }
 
 ssize_t perf_data_file__write(struct perf_data_file *file,
 			      void *buf, size_t size)
 {
+	if (file->use_stdio) {
+		if (fwrite(buf, size, /*nmemb=*/1, file->fptr) == 1)
+			return size;
+		return -1;
+	}
 	return writen(file->fd, buf, size);
 }
 
 ssize_t perf_data__write(struct perf_data *data,
 			 void *buf, size_t size)
 {
-	if (data->use_stdio) {
-		if (fwrite(buf, size, 1, data->file.fptr) == 1)
-			return size;
-		return -1;
-	}
 	return perf_data_file__write(&data->file, buf, size);
 }
 
+off_t perf_data_file__seek(struct perf_data_file *file, off_t offset, int whence)
+{
+	if (file->use_stdio) {
+		off_t res = fseeko(file->fptr, offset, whence);
+
+		return res < 0 ? -1 : ftello(file->fptr);
+	}
+	return lseek(file->fd, offset, whence);
+}
+
+off_t perf_data__seek(struct perf_data *data, off_t offset, int whence)
+{
+	/* Note, a pipe fd will fail with -1 with errno of ESPIPE. */
+	return perf_data_file__seek(&data->file, offset, whence);
+}
+
 int perf_data__switch(struct perf_data *data,
 		      const char *postfix,
 		      size_t pos, bool at_exit,
@@ -420,19 +456,18 @@ int perf_data__switch(struct perf_data *data,
 		pr_warning("Failed to rename %s to %s\n", data->path, *new_filepath);
 
 	if (!at_exit) {
-		close(data->file.fd);
+		perf_data_file__close(&data->file);
 		ret = perf_data__open(data);
 		if (ret < 0)
 			goto out;
 
-		if (lseek(data->file.fd, pos, SEEK_SET) == (off_t)-1) {
+		if (perf_data__seek(data, pos, SEEK_SET) == (off_t)-1) {
 			ret = -errno;
-			pr_debug("Failed to lseek to %zu: %m\n",
-				 pos);
+			pr_debug("Failed to seek to %zu: %m\n", pos);
 			goto out;
 		}
 	}
-	ret = data->file.fd;
+	ret = perf_data__fd(data);
 out:
 	return ret;
 }
diff --git a/tools/perf/util/data.h b/tools/perf/util/data.h
index 1438e32e0451..8299fb5fa7da 100644
--- a/tools/perf/util/data.h
+++ b/tools/perf/util/data.h
@@ -17,32 +17,70 @@ enum perf_dir_version {
 	PERF_DIR_VERSION	= 1,
 };
 
+/**
+ * struct perf_data_file: A wrapper around a file used for perf.data reading or writing. Generally
+ * part of struct perf_data.
+ */
 struct perf_data_file {
+	/**
+	 * @path: Path of file. Generally a copy of perf_data.path but for a
+	 * directory it is the file within the directory.
+	 */
 	char		*path;
 	union {
+		/** @fd: File descriptor for read/writes. Valid if use_stdio is false. */
 		int	 fd;
+		/**
+		 * @fptr: Stdio FILE. Valid if use_stdio is true, currently just
+		 * pipes in perf inject.
+		 */
 		FILE	*fptr;
 	};
+	/** @size: Size of file when opened. */
 	unsigned long	 size;
+	/** @use_stdio: Use buffered stdio operations. */
+	bool		 use_stdio;
 };
 
+/**
+ * struct perf_data: A wrapper around a file used for perf.data reading or writing.
+ */
 struct perf_data {
+	/** @path: Path to open and of the file. NULL implies 'perf.data' will be used. */
 	const char		*path;
+	/** @file: Underlying file to be used. */
 	struct perf_data_file	 file;
+	/** @is_pipe: Underlying file is a pipe. */
 	bool			 is_pipe;
+	/** @is_dir: Underlying file is a directory. */
 	bool			 is_dir;
+	/** @force: Ignore opening a file created by a different user. */
 	bool			 force;
-	bool			 use_stdio;
+	/** @in_place_update: A file opened for reading but will be written to. */
 	bool			 in_place_update;
+	/** @mode: Read or write mode. */
 	enum perf_data_mode	 mode;
 
 	struct {
+		/** @version: perf_dir_version. */
 		u64			 version;
+		/** @files: perf data files for the directory. */
 		struct perf_data_file	*files;
+		/** @nr: Number of perf data files for the directory. */
 		int			 nr;
 	} dir;
 };
 
+static inline int perf_data_file__fd(struct perf_data_file *file)
+{
+	return file->use_stdio ? fileno(file->fptr) : file->fd;
+}
+
+ssize_t perf_data_file__write(struct perf_data_file *file,
+			      void *buf, size_t size);
+off_t perf_data_file__seek(struct perf_data_file *file, off_t offset, int whence);
+
+
 static inline bool perf_data__is_read(struct perf_data *data)
 {
 	return data->mode == PERF_DATA_MODE_READ;
@@ -70,10 +108,7 @@ static inline bool perf_data__is_single_file(struct perf_data *data)
 
 static inline int perf_data__fd(struct perf_data *data)
 {
-	if (data->use_stdio)
-		return fileno(data->file.fptr);
-
-	return data->file.fd;
+	return perf_data_file__fd(&data->file);
 }
 
 int perf_data__open(struct perf_data *data);
@@ -81,8 +116,7 @@ void perf_data__close(struct perf_data *data);
 ssize_t perf_data__read(struct perf_data *data, void *buf, size_t size);
 ssize_t perf_data__write(struct perf_data *data,
 			 void *buf, size_t size);
-ssize_t perf_data_file__write(struct perf_data_file *file,
-			      void *buf, size_t size);
+off_t perf_data__seek(struct perf_data *data, off_t offset, int whence);
 /*
  * If at_exit is set, only rename current perf.data to
  * perf.data.<postfix>, continue write on original data.
@@ -99,8 +133,10 @@ int perf_data__open_dir(struct perf_data *data);
 void perf_data__close_dir(struct perf_data *data);
 unsigned long perf_data__size(struct perf_data *data);
 int perf_data__make_kcore_dir(struct perf_data *data, char *buf, size_t buf_sz);
-bool has_kcore_dir(const char *path);
 char *perf_data__kallsyms_name(struct perf_data *data);
 char *perf_data__guest_kallsyms_name(struct perf_data *data, pid_t machine_pid);
+
+bool has_kcore_dir(const char *path);
 bool is_perf_data(const char *path);
+
 #endif /* __PERF_DATA_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 312ea05e2113..fe0de2a0277f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2583,7 +2583,7 @@ static int __perf_session__process_dir_events(struct perf_session *session)
 		if (!data->dir.files[i].size)
 			continue;
 		rd[readers] = (struct reader) {
-			.fd		 = data->dir.files[i].fd,
+			.fd		 = perf_data_file__fd(&data->dir.files[i]),
 			.path		 = data->dir.files[i].path,
 			.data_size	 = data->dir.files[i].size,
 			.data_offset	 = 0,

From 4cf1f549bbcdfea9c20df52994bb342677472dcd Mon Sep 17 00:00:00 2001
From: Thomas Richter <tmricht@linux.ibm.com>
Date: Wed, 8 Apr 2026 13:31:43 +0200
Subject: [PATCH 108/131] perf test: Make perf trace BTF general tests
 exclusive

Running test cases 126 and 128 together causes the first test case,
126, to fail:
 # for i in $(seq 3); do ./perf test 'perf trace BTF general tests' \
	'perf trace record and replay'; done
 126: perf trace BTF general tests    : FAILED!
 128: perf trace record and replay    : Ok
 126: perf trace BTF general tests    : FAILED!
 128: perf trace record and replay    : Ok
 126: perf trace BTF general tests    : FAILED!
 128: perf trace record and replay    : Ok
 #

Test case 126 fails because test case 128 runs concurrently, as can
be observed in the output of 'ps -ef | grep perf' in a different
window. Both run a perf trace command at the same time.
Make test case 'perf trace BTF general tests' exclusive.

Output after:
 # for i in $(seq 3); do ./perf test 'perf trace BTF general tests' \
	'perf trace record and replay'; done
 127: perf trace BTF general tests                   : Ok
 155: perf trace record and replay                   : Ok
 127: perf trace BTF general tests                   : Ok
 155: perf trace record and replay                   : Ok
 127: perf trace BTF general tests                   : Ok
 155: perf trace record and replay                   : Ok
 #

Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Acked-by: Howard Chu <howardchu95@gmail.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/trace_btf_general.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/shell/trace_btf_general.sh b/tools/perf/tests/shell/trace_btf_general.sh
index ef2da806be6b..7a94a5743924 100755
--- a/tools/perf/tests/shell/trace_btf_general.sh
+++ b/tools/perf/tests/shell/trace_btf_general.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# perf trace BTF general tests
+# perf trace BTF general tests (exclusive)
 # SPDX-License-Identifier: GPL-2.0
 
 err=0

From faaf70f938236b94b150320e452fe2d577936a42 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 10 Apr 2026 08:36:58 +0100
Subject: [PATCH 109/131] perf sort: Support sort ASE and SME

Support sorting Advanced SIMD extension (ASE) and SME operations.

Reviewed-by: James Clark <james.clark@linaro.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/sample.h | 12 +++++++++---
 tools/perf/util/sort.c   |  6 +++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 3d27a0daef8f..0e5ee7e0fb94 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -71,12 +71,18 @@ struct aux_sample {
 };
 
 struct simd_flags {
-	u8	arch:1,	/* architecture (isa) */
-		pred:2;	/* predication */
+	u8	arch:  2,	/* architecture (isa) */
+		pred:  2,	/* predication */
+		resv:  4;	/* reserved */
 };
 
 /* simd architecture flags */
-#define SIMD_OP_FLAGS_ARCH_SVE		0x01	/* ARM SVE */
+enum simd_op_flags {
+	SIMD_OP_FLAGS_ARCH_NONE = 0x0,	/* No SIMD operation */
+	SIMD_OP_FLAGS_ARCH_SVE,		/* Arm SVE */
+	SIMD_OP_FLAGS_ARCH_SME,		/* Arm SME */
+	SIMD_OP_FLAGS_ARCH_ASE,		/* Arm Advanced SIMD */
+};
 
 /* simd predicate flags */
 #define SIMD_OP_FLAGS_PRED_PARTIAL	0x01	/* partial predicate */
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 6ce684d68bd6..7198eb3ae560 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -195,8 +195,12 @@ static const char *hist_entry__get_simd_name(struct simd_flags *simd_flags)
 {
 	u64 arch = simd_flags->arch;
 
-	if (arch & SIMD_OP_FLAGS_ARCH_SVE)
+	if (arch == SIMD_OP_FLAGS_ARCH_SVE)
 		return "SVE";
+	else if (arch == SIMD_OP_FLAGS_ARCH_SME)
+		return "SME";
+	else if (arch == SIMD_OP_FLAGS_ARCH_ASE)
+		return "ASE";
 	else
 		return "n/a";
 }

From 0f648fc245c316d799f853d7ab97f2bfef68d7dd Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 10 Apr 2026 08:36:59 +0100
Subject: [PATCH 110/131] perf sort: Sort disabled and full predicated flags

According to the Arm ARM (ARM DDI 0487, L.a), section D18.2.6
"Events packet", apart from empty and partial predicates, an SVE
or SME operation can be predicate-disabled or fully predicated.

To provide complete results, introduce two predicate types for
these cases.

Reviewed-by: James Clark <james.clark@linaro.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/sample.h | 13 +++++++++----
 tools/perf/util/sort.c   | 15 ++++++++++-----
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index 0e5ee7e0fb94..ca0c407c4423 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -72,8 +72,8 @@ struct aux_sample {
 
 struct simd_flags {
 	u8	arch:  2,	/* architecture (isa) */
-		pred:  2,	/* predication */
-		resv:  4;	/* reserved */
+		pred:  3,	/* predication */
+		resv:  3;	/* reserved */
 };
 
 /* simd architecture flags */
@@ -85,8 +85,13 @@ enum simd_op_flags {
 };
 
 /* simd predicate flags */
-#define SIMD_OP_FLAGS_PRED_PARTIAL	0x01	/* partial predicate */
-#define SIMD_OP_FLAGS_PRED_EMPTY	0x02	/* empty predicate */
+enum simd_pred_flags {
+	SIMD_OP_FLAGS_PRED_NONE = 0x0,	/* Not available */
+	SIMD_OP_FLAGS_PRED_PARTIAL,	/* partial predicate */
+	SIMD_OP_FLAGS_PRED_EMPTY,	/* empty predicate */
+	SIMD_OP_FLAGS_PRED_FULL,	/* full predicate */
+	SIMD_OP_FLAGS_PRED_DISABLED,	/* disabled predicate */
+};
 
 /**
  * struct perf_sample
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 7198eb3ae560..0020089cb13c 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -209,18 +209,23 @@ static int hist_entry__simd_snprintf(struct hist_entry *he, char *bf,
 				     size_t size, unsigned int width __maybe_unused)
 {
 	const char *name;
+	const char *pred_str = ".";
 
 	if (!he->simd_flags.arch)
 		return repsep_snprintf(bf, size, "");
 
 	name = hist_entry__get_simd_name(&he->simd_flags);
 
-	if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_EMPTY)
-		return repsep_snprintf(bf, size, "[e] %s", name);
-	else if (he->simd_flags.pred & SIMD_OP_FLAGS_PRED_PARTIAL)
-		return repsep_snprintf(bf, size, "[p] %s", name);
+	if (he->simd_flags.pred == SIMD_OP_FLAGS_PRED_EMPTY)
+		pred_str = "e";
+	else if (he->simd_flags.pred == SIMD_OP_FLAGS_PRED_PARTIAL)
+		pred_str = "p";
+	else if (he->simd_flags.pred == SIMD_OP_FLAGS_PRED_DISABLED)
+		pred_str = "d";
+	else if (he->simd_flags.pred == SIMD_OP_FLAGS_PRED_FULL)
+		pred_str = "f";
 
-	return repsep_snprintf(bf, size, "[.] %s", name);
+	return repsep_snprintf(bf, size, "[%s] %s", pred_str, name);
 }
 
 static struct sort_entry sort_simd = {

From 54940f15269e0a5f6249e8520f81c2b980111f42 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 10 Apr 2026 08:37:00 +0100
Subject: [PATCH 111/131] perf report: Update document for SIMD flags

Update SIMD architecture and predicate flags.

Reviewed-by: James Clark <james.clark@linaro.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/Documentation/perf-report.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 52f316628e43..22f87eaa3279 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -136,7 +136,10 @@ OPTIONS
 	- addr: (Full) virtual address of the sampled instruction
 	- retire_lat: On X86, this reports pipeline stall of this instruction compared
 	  to the previous instruction in cycles. And currently supported only on X86
-	- simd: Flags describing a SIMD operation. "e" for empty Arm SVE predicate. "p" for partial Arm SVE predicate
+	- simd: Flags describing a SIMD operation. The architecture type can be Arm's
+	  ASE (Advanced SIMD extension), SVE, SME. It provides an extra tag for
+	  predicate: "e" for empty predicate, "p" for partial predicate, "d" for
+	  predicate disabled, and "f" for full predicate.
 	- type: Data type of sample memory access.
 	- typeoff: Offset in the data type of sample memory access.
 	- symoff: Offset in the symbol.

From 4e03d6494f9504f8af46ba68a2a8b6877c196789 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@arm.com>
Date: Fri, 10 Apr 2026 08:37:01 +0100
Subject: [PATCH 112/131] perf arm_spe: Improve SIMD flags setting

Fill in ASE and SME operations for the SIMD arch field.

Also set the predicate flags for SVE and SME, which differ in how the
predicate is reported: SME does not have a predicate flag, so the
setting is based on events. SVE provides a predicate flag to indicate
whether the predicate is disabled, which allows four cases to be
distinguished: partial predicates, empty predicates, full predicates,
and disabled predicates.

After:

    perf report -s +simd
    ...
    0.06%     0.06%  sve-test  sve-test               [.] setz                                        [p] SVE
    0.06%     0.06%  sve-test  [kernel.kallsyms]      [k] do_raw_spin_lock
    0.06%     0.06%  sve-test  sve-test               [.] getz                                        [p] SVE
    0.06%     0.06%  sve-test  [kernel.kallsyms]      [k] timekeeping_advance
    0.06%     0.06%  sve-test  sve-test               [.] getz                                        [d] SVE
    0.06%     0.06%  sve-test  [kernel.kallsyms]      [k] update_load_avg
    0.06%     0.06%  sve-test  sve-test               [.] getz                                        [e] SVE
    0.05%     0.05%  sve-test  sve-test               [.] setz                                        [e] SVE
    0.05%     0.05%  sve-test  [kernel.kallsyms]      [k] update_curr
    0.05%     0.05%  sve-test  sve-test               [.] setz                                        [d] SVE
    0.05%     0.05%  sve-test  [kernel.kallsyms]      [k] do_raw_spin_unlock
    0.05%     0.05%  sve-test  [kernel.kallsyms]      [k] timekeeping_update_from_shadow.constprop.0
    0.05%     0.05%  sve-test  sve-test               [.] getz                                        [f] SVE
    0.05%     0.05%  sve-test  sve-test               [.] setz                                        [f] SVE

Reviewed-by: James Clark <james.clark@linaro.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Leo Yan <leo.yan@arm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/arm-spe.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 70dd9bee47c7..e5835042acdf 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -353,12 +353,26 @@ static struct simd_flags arm_spe__synth_simd_flags(const struct arm_spe_record *
 
 	if (record->op & ARM_SPE_OP_SVE)
 		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SVE;
+	else if (record->op & ARM_SPE_OP_SME)
+		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_SME;
+	else if (record->op & (ARM_SPE_OP_ASE | ARM_SPE_OP_SIMD_FP))
+		simd_flags.arch |= SIMD_OP_FLAGS_ARCH_ASE;
 
-	if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
-		simd_flags.pred |= SIMD_OP_FLAGS_PRED_PARTIAL;
-
-	if (record->type & ARM_SPE_SVE_EMPTY_PRED)
-		simd_flags.pred |= SIMD_OP_FLAGS_PRED_EMPTY;
+	if (record->op & ARM_SPE_OP_SVE) {
+		if (!(record->op & ARM_SPE_OP_PRED))
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_DISABLED;
+		else if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
+		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
+		else
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_FULL;
+	} else {
+		if (record->type & ARM_SPE_SVE_PARTIAL_PRED)
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_PARTIAL;
+		else if (record->type & ARM_SPE_SVE_EMPTY_PRED)
+			simd_flags.pred = SIMD_OP_FLAGS_PRED_EMPTY;
+	}
 
 	return simd_flags;
 }

From fab205e49286ab01cbc6fa8debd65a5a6e6cca71 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 11 Apr 2026 19:08:04 -0700
Subject: [PATCH 113/131] perf sample: Fix documentation typo

s/PEF/PERF/

Signed-off-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/sample.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/sample.h b/tools/perf/util/sample.h
index ca0c407c4423..e556c9b656ea 100644
--- a/tools/perf/util/sample.h
+++ b/tools/perf/util/sample.h
@@ -126,7 +126,7 @@ struct perf_sample {
 	u64 time;
 	/** @addr: The sample event PERF_SAMPLE_ADDR value. */
 	u64 addr;
-	/** @id: The sample event PERF_SAMPLE_ID or PEF_SAMPLE_IDENTIFIER value. */
+	/** @id: The sample event PERF_SAMPLE_ID or PERF_SAMPLE_IDENTIFIER value. */
 	u64 id;
 	/** @stream_id: The sample event PERF_SAMPLE_STREAM_ID value. */
 	u64 stream_id;

From f823d7efb81cd2a799dc386da4f9292fdc2c1dbe Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:53 -0300
Subject: [PATCH 114/131] perf header: Validate nr_domains when reading
 HEADER_CPU_DOMAIN_INFO

Further validate the HEADER_CPU_DOMAIN_INFO fields, this time checking
the nr_domains field.

Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index c6efddb70aee..a2796b72adc4 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3731,6 +3731,12 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 		if (do_read_u32(ff, &nr_domains))
 			return -1;
 
+		if (nr_domains > max_sched_domains) {
+			pr_err("Invalid HEADER_CPU_DOMAIN_INFO: nr_domains %u > max_sched_domains (%u)\n",
+			       nr_domains, max_sched_domains);
+			return -1;
+		}
+
 		cd_map[cpu]->nr_domains = nr_domains;
 
 		cd_map[cpu]->domains = calloc(max_sched_domains, sizeof(*d_info));

From 06452a412e5e89c62cd4917a457c5cfd43dc1ead Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:54 -0300
Subject: [PATCH 115/131] perf header: Bump up the max number of command line
 args allowed

We need to do some upper limit validation, but command line wildcard
expansion can end up with more than 32768 args, so bump up the
arbitrary limit as suggested by Sashiko.

Link: https://sashiko.dev/#/patchset/20260408172846.96360-1-acme%40kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index a2796b72adc4..22c44b6f0b09 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2795,8 +2795,11 @@ process_event_desc(struct feat_fd *ff, void *data __maybe_unused)
 	return 0;
 }
 
-// Some reasonable arbitrary max for the number of command line arguments
-#define MAX_CMDLINE_NR 32768
+/*
+ * Some arbitrary max for the number of command line arguments,
+ * Wildcards can expand and end up with tons of command line args.
+ */
+#define MAX_CMDLINE_NR 1048576
 
 static int process_cmdline(struct feat_fd *ff, void *data __maybe_unused)
 {

From 376ce5a9f706a75815c8281861b66060438798d1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:55 -0300
Subject: [PATCH 116/131] perf header: Sanity check HEADER_NRCPUS and
 HEADER_CPU_DOMAIN_INFO

While working on some cleanups, sashiko raised questions about
pre-existing issues, namely the lack of sanity checks for perf.data
headers. Add some with the help of Claude.

Cc: Ian Rogers <irogers@google.com>
Cc: Swapnil Sapkal <swapnil.sapkal@amd.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 45 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 22c44b6f0b09..4cb748763c8a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -63,6 +63,8 @@
 #include <event-parse.h>
 #endif
 
+#define MAX_SCHED_DOMAINS	64
+
 /*
  * magic2 = "PERFILE2"
  * must be a numerical value to let the endianness
@@ -2722,6 +2724,13 @@ static int process_nrcpus(struct feat_fd *ff, void *data __maybe_unused)
 	ret = do_read_u32(ff, &nr_cpus_online);
 	if (ret)
 		return ret;
+
+	if (nr_cpus_online > nr_cpus_avail) {
+		pr_err("Invalid HEADER_NRCPUS: nr_cpus_online (%u) > nr_cpus_avail (%u)\n",
+		       nr_cpus_online, nr_cpus_avail);
+		return -1;
+	}
+
 	env->nr_cpus_avail = (int)nr_cpus_avail;
 	env->nr_cpus_online = (int)nr_cpus_online;
 	return 0;
@@ -3698,6 +3707,17 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 	nra = env->nr_cpus_avail;
 	nr = env->nr_cpus_online;
 
+	if (nra == 0 || nr == 0) {
+		pr_err("Invalid HEADER_CPU_DOMAIN_INFO: missing HEADER_NRCPUS\n");
+		return -1;
+	}
+
+	if (ff->size < 2 * sizeof(u32) + nr * 2 * sizeof(u32)) {
+		pr_err("Invalid HEADER_CPU_DOMAIN_INFO: section too small (%zu) for %u CPUs\n",
+		       (size_t)ff->size, nr);
+		return -1;
+	}
+
 	cd_map = calloc(nra, sizeof(*cd_map));
 	if (!cd_map)
 		return -1;
@@ -3714,6 +3734,18 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 	if (ret)
 		return ret;
 
+	/*
+	 * Sanity check: real systems have at most ~10 sched domain levels
+	 * (SMT, CLS, MC, PKG + NUMA hops). Reject obviously bogus values
+	 * from malformed perf.data files before they cause excessive
+	 * allocation in the per-CPU loop.
+	 */
+	if (max_sched_domains > MAX_SCHED_DOMAINS) {
+		pr_err("Invalid HEADER_CPU_DOMAIN_INFO: max_sched_domains %u > %u\n",
+		       max_sched_domains, MAX_SCHED_DOMAINS);
+		return -1;
+	}
+
 	env->max_sched_domains = max_sched_domains;
 
 	for (i = 0; i < nr; i++) {
@@ -3725,6 +3757,11 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 			return -1;
 		}
 
+		if (cd_map[cpu]) {
+			pr_err("Invalid HEADER_CPU_DOMAIN_INFO: duplicate cpu %u\n", cpu);
+			return -1;
+		}
+
 		cd_map[cpu] = zalloc(sizeof(*cd_map[cpu]));
 		if (!cd_map[cpu])
 			return -1;
@@ -3760,7 +3797,13 @@ static int process_cpu_domain_info(struct feat_fd *ff, void *data __maybe_unused
 			if (!d_info)
 				return -1;
 
-			assert(cd_map[cpu]->domains[domain] == NULL);
+			if (cd_map[cpu]->domains[domain]) {
+				pr_err("Invalid HEADER_CPU_DOMAIN_INFO: duplicate domain %u for cpu %u\n",
+				       domain, cpu);
+				free(d_info);
+				return -1;
+			}
+
 			cd_map[cpu]->domains[domain] = d_info;
 			d_info->domain = domain;
 

From 22a2e2b29217455cf337c765fc26ad2f55d7291a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:56 -0300
Subject: [PATCH 117/131] perf header: Sanity check HEADER_CPU_TOPOLOGY

Add validation to process_cpu_topology() to harden against malformed
perf.data files:

- Verify nr_cpus_avail was initialized (HEADER_NRCPUS processed first)
- Bounds check sibling counts (cores, threads, dies) against nr_cpus_avail
- Fix two bare 'return -1' that leaked env->cpu by using 'goto free_cpu'

Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4cb748763c8a..acd6b07528e0 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -2861,6 +2861,11 @@ static int process_cpu_topology(struct feat_fd *ff, void *data __maybe_unused)
 	int cpu_nr = env->nr_cpus_avail;
 	u64 size = 0;
 
+	if (cpu_nr == 0) {
+		pr_err("Invalid HEADER_CPU_TOPOLOGY: missing HEADER_NRCPUS\n");
+		return -1;
+	}
+
 	env->cpu = calloc(cpu_nr, sizeof(*env->cpu));
 	if (!env->cpu)
 		return -1;
@@ -2868,6 +2873,12 @@ static int process_cpu_topology(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &nr))
 		goto free_cpu;
 
+	if (nr > (u32)cpu_nr) {
+		pr_err("Invalid HEADER_CPU_TOPOLOGY: nr_sibling_cores (%u) > nr_cpus_avail (%d)\n",
+		       nr, cpu_nr);
+		goto free_cpu;
+	}
+
 	env->nr_sibling_cores = nr;
 	size += sizeof(u32);
 	if (strbuf_init(&sb, 128) < 0)
@@ -2887,7 +2898,13 @@ static int process_cpu_topology(struct feat_fd *ff, void *data __maybe_unused)
 	env->sibling_cores = strbuf_detach(&sb, NULL);
 
 	if (do_read_u32(ff, &nr))
-		return -1;
+		goto free_cpu;
+
+	if (nr > (u32)cpu_nr) {
+		pr_err("Invalid HEADER_CPU_TOPOLOGY: nr_sibling_threads (%u) > nr_cpus_avail (%d)\n",
+		       nr, cpu_nr);
+		goto free_cpu;
+	}
 
 	env->nr_sibling_threads = nr;
 	size += sizeof(u32);
@@ -2936,7 +2953,13 @@ static int process_cpu_topology(struct feat_fd *ff, void *data __maybe_unused)
 		return 0;
 
 	if (do_read_u32(ff, &nr))
-		return -1;
+		goto free_cpu;
+
+	if (nr > (u32)cpu_nr) {
+		pr_err("Invalid HEADER_CPU_TOPOLOGY: nr_sibling_dies (%u) > nr_cpus_avail (%d)\n",
+		       nr, cpu_nr);
+		goto free_cpu;
+	}
 
 	env->nr_sibling_dies = nr;
 	size += sizeof(u32);

From 4ba223016b0be7ec11aad63f480cd251cecad594 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:57 -0300
Subject: [PATCH 118/131] perf header: Sanity check HEADER_NUMA_TOPOLOGY

Add validation to process_numa_topology() to harden against malformed
perf.data files:

- Upper bound check on nr_nodes (max 4096)
- Minimum section size check before allocating

Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index acd6b07528e0..2f405776e501 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -63,6 +63,7 @@
 #include <event-parse.h>
 #endif
 
+#define MAX_NUMA_NODES		4096
 #define MAX_SCHED_DOMAINS	64
 
 /*
@@ -3005,6 +3006,18 @@ static int process_numa_topology(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &nr))
 		return -1;
 
+	if (nr > MAX_NUMA_NODES) {
+		pr_err("Invalid HEADER_NUMA_TOPOLOGY: nr_nodes (%u) > %u\n",
+		       nr, MAX_NUMA_NODES);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + nr * (sizeof(u32) + 2 * sizeof(u64))) {
+		pr_err("Invalid HEADER_NUMA_TOPOLOGY: section too small (%zu) for %u nodes\n",
+		       ff->size, nr);
+		return -1;
+	}
+
 	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -ENOMEM;

From a881fc56038a7baa5cb5074cdd52315d9ad9ee63 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:58 -0300
Subject: [PATCH 119/131] perf header: Sanity check HEADER_MEM_TOPOLOGY

Add validation to process_mem_topology() to harden against malformed
perf.data files:

- Upper bound check on nr_nodes (reuses MAX_NUMA_NODES, 4096)
- Minimum section size check before allocating

This is particularly important here since nr is u64, making unbounded
values especially dangerous.

Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 2f405776e501..2eb909672f82 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3308,6 +3308,18 @@ static int process_mem_topology(struct feat_fd *ff,
 	if (do_read_u64(ff, &nr))
 		return -1;
 
+	if (nr > MAX_NUMA_NODES) {
+		pr_err("Invalid HEADER_MEM_TOPOLOGY: nr_nodes (%llu) > %u\n",
+		       (unsigned long long)nr, MAX_NUMA_NODES);
+		return -1;
+	}
+
+	if (ff->size < 3 * sizeof(u64) + nr * 2 * sizeof(u64)) {
+		pr_err("Invalid HEADER_MEM_TOPOLOGY: section too small (%zu) for %llu nodes\n",
+		       ff->size, (unsigned long long)nr);
+		return -1;
+	}
+
 	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -1;

From f613a6d694aa499edb2a291ab2c2d906619585f2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:08:59 -0300
Subject: [PATCH 120/131] perf header: Sanity check HEADER_PMU_MAPPINGS

Add an upper bound check on pmu_num (max 4096) and a minimum section
size check in process_pmu_mappings() to harden against malformed
perf.data files.

Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 2eb909672f82..77035d9b138c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -64,6 +64,7 @@
 #endif
 
 #define MAX_NUMA_NODES		4096
+#define MAX_PMU_MAPPINGS	4096
 #define MAX_SCHED_DOMAINS	64
 
 /*
@@ -3069,6 +3070,18 @@ static int process_pmu_mappings(struct feat_fd *ff, void *data __maybe_unused)
 		return 0;
 	}
 
+	if (pmu_num > MAX_PMU_MAPPINGS) {
+		pr_err("Invalid HEADER_PMU_MAPPINGS: pmu_num (%u) > %u\n",
+		       pmu_num, MAX_PMU_MAPPINGS);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + pmu_num * 2 * sizeof(u32)) {
+		pr_err("Invalid HEADER_PMU_MAPPINGS: section too small (%zu) for %u PMUs\n",
+		       ff->size, pmu_num);
+		return -1;
+	}
+
 	env->nr_pmu_mappings = pmu_num;
 	if (strbuf_init(&sb, 128) < 0)
 		return -1;

From 6830e20c92e7388ae4834a3574a0d3d90500c4c1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:00 -0300
Subject: [PATCH 121/131] perf header: Sanity check HEADER_GROUP_DESC

Add an upper bound check on nr_groups (max 32768) and a minimum
section size check in process_group_desc() to harden against malformed
perf.data files, and move the env assignment after validation.

Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 77035d9b138c..993e20debd5c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -63,6 +63,7 @@
 #include <event-parse.h>
 #endif
 
+#define MAX_GROUP_DESC		32768
 #define MAX_NUMA_NODES		4096
 #define MAX_PMU_MAPPINGS	4096
 #define MAX_SCHED_DOMAINS	64
@@ -3132,12 +3133,25 @@ static int process_group_desc(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &nr_groups))
 		return -1;
 
-	env->nr_groups = nr_groups;
 	if (!nr_groups) {
 		pr_debug("group desc not available\n");
 		return 0;
 	}
 
+	if (nr_groups > MAX_GROUP_DESC) {
+		pr_err("Invalid HEADER_GROUP_DESC: nr_groups (%u) > %u\n",
+		       nr_groups, MAX_GROUP_DESC);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + nr_groups * 3 * sizeof(u32)) {
+		pr_err("Invalid HEADER_GROUP_DESC: section too small (%zu) for %u groups\n",
+		       ff->size, nr_groups);
+		return -1;
+	}
+
+	env->nr_groups = nr_groups;
+
 	desc = calloc(nr_groups, sizeof(*desc));
 	if (!desc)
 		return -1;

From 110a661708a6a90997442f02f261e2043624a1c8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:01 -0300
Subject: [PATCH 122/131] perf header: Sanity check HEADER_CACHE

Add an upper bound check on the cache entry count (max 32768) and a
minimum section size check in process_cache() to harden against
malformed perf.data files.

Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 993e20debd5c..749a522fe057 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -63,6 +63,7 @@
 #include <event-parse.h>
 #endif
 
+#define MAX_CACHE_ENTRIES	32768
 #define MAX_GROUP_DESC		32768
 #define MAX_NUMA_NODES		4096
 #define MAX_PMU_MAPPINGS	4096
@@ -3243,6 +3244,18 @@ static int process_cache(struct feat_fd *ff, void *data __maybe_unused)
 	if (do_read_u32(ff, &cnt))
 		return -1;
 
+	if (cnt > MAX_CACHE_ENTRIES) {
+		pr_err("Invalid HEADER_CACHE: cnt (%u) > %u\n",
+		       cnt, MAX_CACHE_ENTRIES);
+		return -1;
+	}
+
+	if (ff->size < 2 * sizeof(u32) + cnt * 7 * sizeof(u32)) {
+		pr_err("Invalid HEADER_CACHE: section too small (%zu) for %u entries\n",
+		       ff->size, cnt);
+		return -1;
+	}
+
 	caches = calloc(cnt, sizeof(*caches));
 	if (!caches)
 		return -1;

From 47c68eb15ae90fa3953db9a67b4569089ff63cd0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:02 -0300
Subject: [PATCH 123/131] perf header: Sanity check HEADER_HYBRID_TOPOLOGY

Add an upper bound check on nr_nodes (reuses MAX_PMU_MAPPINGS, 4096)
and a minimum section size check in process_hybrid_topology() to
harden against malformed perf.data files.

Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 749a522fe057..a609fc7d959f 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3450,6 +3450,18 @@ static int process_hybrid_topology(struct feat_fd *ff,
 	if (do_read_u32(ff, &nr))
 		return -1;
 
+	if (nr > MAX_PMU_MAPPINGS) {
+		pr_err("Invalid HEADER_HYBRID_TOPOLOGY: nr_nodes (%u) > %u\n",
+		       nr, MAX_PMU_MAPPINGS);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + nr * 2 * sizeof(u32)) {
+		pr_err("Invalid HEADER_HYBRID_TOPOLOGY: section too small (%zu) for %u nodes\n",
+		       ff->size, nr);
+		return -1;
+	}
+
 	nodes = calloc(nr, sizeof(*nodes));
 	if (!nodes)
 		return -ENOMEM;

From f5722a6b6a443fd56ce0a71b4be4c75d7a857dbe Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:03 -0300
Subject: [PATCH 124/131] perf header: Sanity check HEADER_PMU_CAPS

Add upper bound checks in PMU capabilities processing to harden against
malformed perf.data files:

- nr_pmu bounded to MAX_PMU_MAPPINGS (4096) in process_pmu_caps()
- nr_pmu_caps bounded to MAX_PMU_CAPS (512) in __process_pmu_caps()

Cc: Ravi Bangoria <ravi.bangoria@amd.com>
Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index a609fc7d959f..37c1afbc0816 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -66,6 +66,7 @@
 #define MAX_CACHE_ENTRIES	32768
 #define MAX_GROUP_DESC		32768
 #define MAX_NUMA_NODES		4096
+#define MAX_PMU_CAPS		512
 #define MAX_PMU_MAPPINGS	4096
 #define MAX_SCHED_DOMAINS	64
 
@@ -3677,6 +3678,12 @@ static int __process_pmu_caps(struct feat_fd *ff, int *nr_caps,
 	if (!nr_pmu_caps)
 		return 0;
 
+	if (nr_pmu_caps > MAX_PMU_CAPS) {
+		pr_err("Invalid pmu caps: nr_pmu_caps (%u) > %u\n",
+		       nr_pmu_caps, MAX_PMU_CAPS);
+		return -1;
+	}
+
 	*caps = calloc(nr_pmu_caps, sizeof(char *));
 	if (!*caps)
 		return -1;
@@ -3754,6 +3761,18 @@ static int process_pmu_caps(struct feat_fd *ff, void *data __maybe_unused)
 		return 0;
 	}
 
+	if (nr_pmu > MAX_PMU_MAPPINGS) {
+		pr_err("Invalid HEADER_PMU_CAPS: nr_pmu (%u) > %u\n",
+		       nr_pmu, MAX_PMU_MAPPINGS);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + nr_pmu * sizeof(u32)) {
+		pr_err("Invalid HEADER_PMU_CAPS: section too small (%zu) for %u PMUs\n",
+		       ff->size, nr_pmu);
+		return -1;
+	}
+
 	pmu_caps = calloc(nr_pmu, sizeof(*pmu_caps));
 	if (!pmu_caps)
 		return -ENOMEM;

From 66af7e9b05c4e7ff435c0aef0d253a65d290f03c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:04 -0300
Subject: [PATCH 125/131] perf header: Sanity check HEADER_BPF_PROG_INFO

Add validation to process_bpf_prog_info() to harden against malformed
perf.data files:

- Upper bound on BPF program count (max 131072)
- Upper bound on per-program data_len (max 256MB)

Cc: Ian Rogers <irogers@google.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 37c1afbc0816..705f1ab44bc9 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -63,6 +63,8 @@
 #include <event-parse.h>
 #endif
 
+#define MAX_BPF_DATA_LEN	(256 * 1024 * 1024)
+#define MAX_BPF_PROGS		131072
 #define MAX_CACHE_ENTRIES	32768
 #define MAX_GROUP_DESC		32768
 #define MAX_NUMA_NODES		4096
@@ -3525,6 +3527,18 @@ static int process_bpf_prog_info(struct feat_fd *ff __maybe_unused, void *data _
 	if (do_read_u32(ff, &count))
 		return -1;
 
+	if (count > MAX_BPF_PROGS) {
+		pr_err("Invalid HEADER_BPF_PROG_INFO: count (%u) > %u\n",
+		       count, MAX_BPF_PROGS);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + count * (2 * sizeof(u32) + sizeof(u64))) {
+		pr_err("Invalid HEADER_BPF_PROG_INFO: section too small (%zu) for %u entries\n",
+		       ff->size, count);
+		return -1;
+	}
+
 	down_write(&env->bpf_progs.lock);
 
 	for (i = 0; i < count; ++i) {
@@ -3542,6 +3556,12 @@ static int process_bpf_prog_info(struct feat_fd *ff __maybe_unused, void *data _
 			goto out;
 		}
 
+		if (data_len > MAX_BPF_DATA_LEN) {
+			pr_warning("Invalid HEADER_BPF_PROG_INFO: data_len (%u) too large\n",
+				   data_len);
+			goto out;
+		}
+
 		info_linear = malloc(sizeof(struct perf_bpil) +
 				     data_len);
 		if (!info_linear)

From dff56bdafae8e65d9acb88cc98e1f5129c352201 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Apr 2026 19:09:05 -0300
Subject: [PATCH 126/131] perf header: Add sanity checks to HEADER_BPF_BTF
 processing

Validate the BTF entry count and individual data sizes when reading
HEADER_BPF_BTF from perf.data files to prevent excessive memory
allocation from malformed files.

Reuses the MAX_BPF_PROGS (131072) and MAX_BPF_DATA_LEN (256 MB)
limits from HEADER_BPF_PROG_INFO processing.

Cc: Song Liu <song@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Assisted-by: Claude Code:claude-opus-4-6
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/header.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 705f1ab44bc9..f30e48eb3fc3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -3622,6 +3622,17 @@ static int process_bpf_btf(struct feat_fd *ff  __maybe_unused, void *data __mayb
 	if (do_read_u32(ff, &count))
 		return -1;
 
+	if (count > MAX_BPF_PROGS) {
+		pr_err("bpf btf count %u too large (max %u)\n", count, MAX_BPF_PROGS);
+		return -1;
+	}
+
+	if (ff->size < sizeof(u32) + count * 2 * sizeof(u32)) {
+		pr_err("Invalid HEADER_BPF_BTF: section too small (%zu) for %u entries\n",
+		       ff->size, count);
+		return -1;
+	}
+
 	down_write(&env->bpf_progs.lock);
 
 	for (i = 0; i < count; ++i) {
@@ -3632,6 +3643,12 @@ static int process_bpf_btf(struct feat_fd *ff  __maybe_unused, void *data __mayb
 		if (do_read_u32(ff, &data_size))
 			goto out;
 
+		if (data_size > MAX_BPF_DATA_LEN) {
+			pr_err("bpf btf data size %u too large (max %u)\n",
+			       data_size, MAX_BPF_DATA_LEN);
+			goto out;
+		}
+
 		node = malloc(sizeof(struct btf_node) + data_size);
 		if (!node)
 			goto out;

From 97ab89686a9e5d087042dbe73604a32b3de72653 Mon Sep 17 00:00:00 2001
From: Markus Mayer <mmayer@broadcom.com>
Date: Thu, 9 Apr 2026 15:14:17 -0700
Subject: [PATCH 127/131] perf build: fix "argument list too long" in second
 location

Turns out that displaying "RM $^" via quiet_cmd_rm can also upset the
shell and cause it to display "argument list too long".

Trying to quote $^ doesn't help.

In the end, *not* displaying the (potentially long) list of files is
probably the right thing to do for a "quiet" message, anyway. Instead,
let's display a count of how many files were removed. There is always
V=1 if more detail is required.

  TEST    linux/tools/perf/pmu-events/metric_test.log
  RM      ...634 orphan file(s)...
  LD      linux/tools/perf/util/perf-util-in.o

Also move the comment regarding xargs before the rule, so it doesn't
show up in the build output.

Signed-off-by: Markus Mayer <mmayer@broadcom.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/pmu-events/Build | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/pmu-events/Build b/tools/perf/pmu-events/Build
index dc5f94862a3b..dc1df2d57ddc 100644
--- a/tools/perf/pmu-events/Build
+++ b/tools/perf/pmu-events/Build
@@ -211,10 +211,10 @@ ifneq ($(strip $(ORPHAN_FILES)),)
 
 # Message for $(call echo-cmd,rm). Generally cleaning files isn't part
 # of a build step.
-quiet_cmd_rm  = RM      $^
+quiet_cmd_rm = RM      ...$(words $^) orphan file(s)...
 
+# The list of files can be long. Use xargs to prevent issues.
 prune_orphans: $(ORPHAN_FILES)
-	# The list of files can be long. Use xargs to prevent issues.
 	$(Q)$(call echo-cmd,rm)echo "$^" | xargs rm -f
 
 JEVENTS_DEPS += prune_orphans

From c7fe4e5665b7c31a24d362229182f6ee27e07233 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Sat, 11 Apr 2026 12:37:05 -0700
Subject: [PATCH 128/131] perf test: Fix inet_pton probe failure and unroll
 call graph

When adding a probe for libc's inet_pton, perf probe may create multiple
probe points (e.g., due to inlining or multiple symbol resolutions),
resulting in multiple identical event names being output (e.g.,
`probe_libc:inet_pton_1`).

The script previously used a brittle pipeline (`tail -n +2 | head -n -5`)
and an awk script to extract the event name. When multiple probes were
added, awk would output the event name multiple times, which expanded
to multiple words in bash. This broke the subsequent `perf record` and
`perf probe -d` commands, causing the test to fail with:
`Error: another command except --add is set.`

Fix this by removing the brittle `tail/head` commands and appending
`| head -n 1` to the awk extraction. This ensures that only a single,
unique event name is captured, regardless of how many probe points
are created.

Additionally, the test artificially limited the backtrace size via
`max-stack=4` and did not specify dwarf call graphs for non-s390x
architectures. In newer libc versions where `inet_pton` is nested
deeper or compiled without frame pointers, `perf script` failed to resolve
the backtrace up to `/bin/ping`. Fix this by explicitly collecting
dwarf call-graphs for all architectures and increasing `max-stack` to 8.

Assisted-by: Gemini:gemini-3.1-pro-preview
Signed-off-by: Ian Rogers <irogers@google.com>
Tested-by: Thomas Richter <tmricht@linux.ibm.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/record+probe_libc_inet_pton.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
index ab99bef556bf..eca629ee83f0 100755
--- a/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
+++ b/tools/perf/tests/shell/record+probe_libc_inet_pton.sh
@@ -22,9 +22,9 @@ event_pattern='probe_libc:inet_pton(_[[:digit:]]+)?'
 
 add_libc_inet_pton_event() {
 
-	event_name=$(perf probe -f -x $libc -a inet_pton 2>&1 | tail -n +2 | head -n -5 | \
+	event_name=$(perf probe -f -x $libc -a inet_pton 2>&1 | \
 			awk -v ep="$event_pattern" -v l="$libc" '$0 ~ ep && $0 ~ \
-			("\\(on inet_pton in " l "\\)") {print $1}')
+			("\\(on inet_pton in " l "\\)") {print $1}' | head -n 1)
 
 	if [ $? -ne 0 ] || [ -z "$event_name" ] ; then
 		printf "FAIL: could not add event\n"
@@ -40,12 +40,12 @@ trace_libc_inet_pton_backtrace() {
 	echo ".*inet_pton\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$" >> $expected
 	case "$(uname -m)" in
 	s390x)
-		eventattr='call-graph=dwarf,max-stack=4'
+		eventattr='call-graph=dwarf,max-stack=8'
 		echo "((__GI_)?getaddrinfo|text_to_binary_address)\+0x[[:xdigit:]]+[[:space:]]\($libc|inlined\)$" >> $expected
 		echo "(gaih_inet|main)\+0x[[:xdigit:]]+[[:space:]]\(inlined|.*/bin/ping.*\)$" >> $expected
 		;;
 	*)
-		eventattr='max-stack=4'
+		eventattr='call-graph=dwarf,max-stack=8'
 		echo ".*(\+0x[[:xdigit:]]+|\[unknown\])[[:space:]]\(.*/bin/ping.*\)$" >> $expected
 		;;
 	esac

From 86d1095fdb7017a93e9d7be875775f7e5aa5c2f5 Mon Sep 17 00:00:00 2001
From: Ian Rogers <irogers@google.com>
Date: Wed, 8 Apr 2026 17:02:16 -0700
Subject: [PATCH 129/131] perf test: Fixes for check branch stack sampling

When filtering branch stack samples on user events they sample in user
land but may have come from the kernel. Aarch64 avoids leaking the
kernel address for kaslr reasons but other platforms, for now,
don't. Be more permissive in allowing kernel addresses in the source
of user branch stacks.

When filtering branch stack samples on kernel events they sample in
kernel land but may have come from user land. Avoid the target being a
user address but allow the source to be in user land. Aarch64 may not
leak the user land addresses (making them 0) but other platforms
do. As the kernel address sampling implies privilege, just allow this.

Increase the duration of the system call sampling test to make the
likelihood of sampling a system call higher (increased from 1000 to
8000 loops - a number found through experimentation on an Intel
Tigerlake laptop), also make the period of the event a prime number.

Put unneeded perf record output into a temporary file so that the test
output isn't cluttered. More clearly state which test is running and
the pass, fail or skipped result of the test.

These changes make the test on an Intel tigerlake laptop reliably pass
rather than reliably fail.

Signed-off-by: Ian Rogers <irogers@google.com>
Reviewed-by: James Clark <james.clark@linaro.org>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/tests/shell/test_brstack.sh | 148 ++++++++++++++++---------
 1 file changed, 97 insertions(+), 51 deletions(-)

diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh
index 85233d435be6..eb5837f82e39 100755
--- a/tools/perf/tests/shell/test_brstack.sh
+++ b/tools/perf/tests/shell/test_brstack.sh
@@ -38,9 +38,13 @@ is_arm64() {
 	[ "$(uname -m)" = "aarch64" ];
 }
 
+has_kaslr_bug() {
+	[ "$(uname -m)" != "aarch64" ];
+}
+
 check_branches() {
 	if ! tr -s ' ' '\n' < "$TMPDIR/perf.script" | grep -E -m1 -q "$1"; then
-		echo "Branches missing $1"
+		echo "ERROR: Branches missing $1"
 		err=1
 	fi
 }
@@ -48,6 +52,8 @@ check_branches() {
 test_user_branches() {
 	echo "Testing user branch stack sampling"
 
+	start_err=$err
+	err=0
 	perf record -o "$TMPDIR/perf.data" --branch-filter any,save_type,u -- ${TESTPROG} > "$TMPDIR/record.txt" 2>&1
 	perf script -i "$TMPDIR/perf.data" --fields brstacksym > "$TMPDIR/perf.script"
 
@@ -73,59 +79,88 @@ test_user_branches() {
 	perf script -i "$TMPDIR/perf.data" --fields brstack | \
 		tr ' ' '\n' > "$TMPDIR/perf.script"
 
-	# There should be no kernel addresses with the u option, in either
-	# source or target addresses.
-	if grep -E -m1 "0x[89a-f][0-9a-f]{15}" $TMPDIR/perf.script; then
-		echo "ERROR: Kernel address found in user mode"
+	# There should be no kernel addresses in the target with the u option.
+	local regex="0x[89a-f][0-9a-f]{15}"
+	if has_kaslr_bug; then
+		# If the system has a kaslr bug that may leak kernel addresses
+		# in the source of something like an ERET/SYSRET. Make the regex
+		# more specific and just check the target address is in user
+		# code.
+		regex="^0x[0-9a-f]{0,16}/0x[89a-f][0-9a-f]{15}/"
+	fi
+	if grep -q -E -m1 "$regex" $TMPDIR/perf.script; then
+		echo "Testing user branch stack sampling [Failed kernel address found in user mode]"
 		err=1
 	fi
 	# some branch types are still not being tested:
 	# IND COND_CALL COND_RET SYSRET SERROR NO_TX
+	if [ $err -eq 0 ]; then
+		echo "Testing user branch stack sampling [Passed]"
+		err=$start_err
+	else
+		echo "Testing user branch stack sampling [Failed]"
+	fi
 }
 
 test_trap_eret_branches() {
 	echo "Testing trap & eret branches"
-	if ! is_arm64; then
-		echo "skip: not arm64"
-	else
-		perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u,k -- \
-			perf test -w traploop 1000
-		perf script -i $TMPDIR/perf.data --fields brstacksym | \
-			tr ' ' '\n' > $TMPDIR/perf.script
 
-		# BRBINF<n>.TYPE == TRAP are mapped to PERF_BR_IRQ by the BRBE driver
-		check_branches "^trap_bench\+[^ ]+/[^ ]/IRQ/"
-		check_branches "^[^ ]+/trap_bench\+[^ ]+/ERET/"
+	if ! is_arm64; then
+		echo "Testing trap & eret branches [Skipped not arm64]"
+		return
+	fi
+	start_err=$err
+	err=0
+	perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u,k -- \
+		perf test -w traploop 1000 > "$TMPDIR/record.txt" 2>&1
+	perf script -i $TMPDIR/perf.data --fields brstacksym | \
+		tr ' ' '\n' > $TMPDIR/perf.script
+
+	# BRBINF<n>.TYPE == TRAP are mapped to PERF_BR_IRQ by the BRBE driver
+	check_branches "^trap_bench\+[^ ]+/[^ ]/IRQ/"
+	check_branches "^[^ ]+/trap_bench\+[^ ]+/ERET/"
+	if [ $err -eq 0 ]; then
+		echo "Testing trap & eret branches [Passed]"
+		err=$start_err
+	else
+		echo "Testing trap & eret branches [Failed]"
 	fi
 }
 
 test_kernel_branches() {
-	echo "Testing that k option only includes kernel source addresses"
+	echo "Testing kernel branch sampling"
 
-	if ! perf record --branch-filter any,k -o- -- true > /dev/null; then
-		echo "skip: not enough privileges"
+	if ! perf record --branch-filter any,k -o- -- true > "$TMPDIR/record.txt" 2>&1; then
+		echo "Testing that k option [Skipped not enough privileges]"
+		return
+	fi
+	start_err=$err
+	err=0
+	perf record -o $TMPDIR/perf.data --branch-filter any,k -- \
+		perf bench syscall basic --loop 1000 > "$TMPDIR/record.txt" 2>&1
+	perf script -i $TMPDIR/perf.data --fields brstack | \
+		tr ' ' '\n' > $TMPDIR/perf.script
+
+	# Example of branch entries:
+	#       "0xffffffff93bda241/0xffffffff93bda20f/M/-/-/..."
+	# Source addresses come first in user or kernel code. Next is the target
+	# address that must be in the kernel.
+
+	# Look for source addresses with top bit set
+	if ! grep -q -E -m1 "^0x[89a-f][0-9a-f]{15}" $TMPDIR/perf.script; then
+		echo "Testing kernel branch sampling [Failed kernel branches missing]"
+		err=1
+	fi
+	# Look for no target addresses without top bit set
+	if grep -q -E -m1 "^0x[0-9a-f]{0,16}/0x[0-7][0-9a-f]{1,15}/" $TMPDIR/perf.script; then
+		echo "Testing kernel branch sampling [Failed user branches found]"
+		err=1
+	fi
+	if [ $err -eq 0 ]; then
+		echo "Testing kernel branch sampling [Passed]"
+		err=$start_err
 	else
-		perf record -o $TMPDIR/perf.data --branch-filter any,k -- \
-			perf bench syscall basic --loop 1000
-		perf script -i $TMPDIR/perf.data --fields brstack | \
-			tr ' ' '\n' > $TMPDIR/perf.script
-
-		# Example of branch entries:
-		#       "0xffffffff93bda241/0xffffffff93bda20f/M/-/-/..."
-		# Source addresses come first and target address can be either
-		# userspace or kernel even with k option, as long as the source
-		# is in kernel.
-
-		#Look for source addresses with top bit set
-		if ! grep -E -m1 "^0x[89a-f][0-9a-f]{15}" $TMPDIR/perf.script; then
-			echo "ERROR: Kernel branches missing"
-			err=1
-		fi
-		# Look for no source addresses without top bit set
-		if grep -E -m1 "^0x[0-7][0-9a-f]{0,15}" $TMPDIR/perf.script; then
-			echo "ERROR: User branches found with kernel filter"
-			err=1
-		fi
+		echo "Testing kernel branch sampling [Failed]"
 	fi
 }
 
@@ -136,14 +171,15 @@ test_filter() {
 	test_filter_expect=$2
 
 	echo "Testing branch stack filtering permutation ($test_filter_filter,$test_filter_expect)"
-	perf record -o "$TMPDIR/perf.data" --branch-filter "$test_filter_filter,save_type,u" -- ${TESTPROG}  > "$TMPDIR/record.txt" 2>&1
+	perf record -o "$TMPDIR/perf.data" --branch-filter "$test_filter_filter,save_type,u" -- \
+		${TESTPROG}  > "$TMPDIR/record.txt" 2>&1
 	perf script -i "$TMPDIR/perf.data" --fields brstack > "$TMPDIR/perf.script"
 
 	# fail if we find any branch type that doesn't match any of the expected ones
 	# also consider UNKNOWN branch types (-)
 	if [ ! -s "$TMPDIR/perf.script" ]
 	then
-		echo "Empty script output"
+		echo "Testing branch stack filtering [Failed empty script output]"
 		err=1
 		return
 	fi
@@ -154,26 +190,36 @@ test_filter() {
 	  > "$TMPDIR/perf.script-filtered" || true
 	if [ -s "$TMPDIR/perf.script-filtered" ]
 	then
-		echo "Unexpected branch filter in script output"
+		echo "Testing branch stack filtering [Failed unexpected branch filter]"
 		cat "$TMPDIR/perf.script"
 		err=1
 		return
 	fi
+	echo "Testing branch stack filtering [Passed]"
 }
 
 test_syscall() {
 	echo "Testing syscalls"
 	# skip if perf doesn't have enough privileges
-	if ! perf record --branch-filter any,k -o- -- true > /dev/null; then
-		echo "skip: not enough privileges"
-	else
-		perf record -o $TMPDIR/perf.data --branch-filter \
-			any_call,save_type,u,k -c 10000 -- \
-			perf bench syscall basic --loop 1000
-		perf script -i $TMPDIR/perf.data --fields brstacksym | \
-			tr ' ' '\n' > $TMPDIR/perf.script
+	if ! perf record --branch-filter any,k -o- -- true > "$TMPDIR/record.txt" 2>&1; then
+		echo "Testing syscalls [Skipped: not enough privileges]"
+		return
+	fi
+	start_err=$err
+	err=0
+	perf record -o $TMPDIR/perf.data --branch-filter \
+		any_call,save_type,u,k -c 10007 -- \
+		perf bench syscall basic --loop 8000  > "$TMPDIR/record.txt" 2>&1
+	perf script -i $TMPDIR/perf.data --fields brstacksym | \
+		tr ' ' '\n' > $TMPDIR/perf.script
 
-		check_branches "getppid[^ ]*/SYSCALL/"
+	check_branches "getppid[^ ]*/SYSCALL/"
+
+	if [ $err -eq 0 ]; then
+		echo "Testing syscalls [Passed]"
+		err=$start_err
+	else
+		echo "Testing syscalls [Failed]"
 	fi
 }
 set -e

From a355eefc36c4481188249b067832b40a2c45fa5c Mon Sep 17 00:00:00 2001
From: Rong Bao <rong.bao@csmantle.top>
Date: Mon, 13 Apr 2026 18:03:55 +0800
Subject: [PATCH 130/131] perf annotate: Use jump__delete when freeing
 LoongArch jumps

Currently, the initialization of loongarch_jump_ops does not contain an
assignment to its .free field. This causes disasm_line__free() to fall
through to ins_ops__delete() for LoongArch jump instructions.

ins_ops__delete() will free ins_operands.source.raw and
ins_operands.source.name, and these fields overlap with
ins_operands.jump.raw_comment and ins_operands.jump.raw_func_start.
Since in loongarch_jump__parse(), these two fields are populated by
strchr()-ing the same buffer, trying to free them will lead to undefined
behavior.

This invalid free usually leads to crashes:

        Process 1712902 (perf) of user 1000 dumped core.
        Stack trace of thread 1712902:
        #0  0x00007fffef155c58 n/a (libc.so.6 + 0x95c58)
        #1  0x00007fffef0f7a94 raise (libc.so.6 + 0x37a94)
        #2  0x00007fffef0dd6a8 abort (libc.so.6 + 0x1d6a8)
        #3  0x00007fffef145490 n/a (libc.so.6 + 0x85490)
        #4  0x00007fffef1646f4 n/a (libc.so.6 + 0xa46f4)
        #5  0x00007fffef164718 n/a (libc.so.6 + 0xa4718)
        #6  0x00005555583a6764 __zfree (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x106764)
        #7  0x000055555854fb70 disasm_line__free (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x2afb70)
        #8  0x000055555853d618 annotated_source__purge (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x29d618)
        #9  0x000055555852300c __hist_entry__tui_annotate (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x28300c)
        #10 0x0000555558526718 do_annotate (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x286718)
        #11 0x000055555852ed94 evsel__hists_browse (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x28ed94)
        #12 0x000055555831fdd0 cmd_report (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x7fdd0)
        #13 0x000055555839b644 handle_internal_command (/home/csmantle/dist/linux-arch/tools/perf/perf + 0xfb644)
        #14 0x00005555582fe6ac main (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x5e6ac)
        #15 0x00007fffef0ddd90 n/a (libc.so.6 + 0x1dd90)
        #16 0x00007fffef0ddf0c __libc_start_main (libc.so.6 + 0x1df0c)
        #17 0x00005555582fed10 _start (/home/csmantle/dist/linux-arch/tools/perf/perf + 0x5ed10)
        ELF object binary architecture: LoongArch

... and it can be confirmed with Valgrind:

        ==1721834== Invalid free() / delete / delete[] / realloc()
        ==1721834==    at 0x4EA9014: free (in /usr/lib/valgrind/vgpreload_memcheck-loongarch64-linux.so)
        ==1721834==    by 0x4106287: __zfree (zalloc.c:13)
        ==1721834==    by 0x42ADC8F: disasm_line__free (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x429B737: annotated_source__purge (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42811EB: __hist_entry__tui_annotate (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42848D7: do_annotate (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x428CF33: evsel__hists_browse (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==  Address 0x7d34303 is 35 bytes inside a block of size 62 alloc'd
        ==1721834==    at 0x4EA59B8: malloc (in /usr/lib/valgrind/vgpreload_memcheck-loongarch64-linux.so)
        ==1721834==    by 0x6B80B6F: strdup (strdup.c:42)
        ==1721834==    by 0x42AD917: disasm_line__new (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42AE5A3: symbol__disassemble_objdump (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42AF0A7: symbol__disassemble (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x429B3CF: symbol__annotate (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x429C233: symbol__annotate2 (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42804D3: __hist_entry__tui_annotate (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x42848D7: do_annotate (in /home/csmantle/dist/linux-arch/tools/perf/perf)
        ==1721834==    by 0x428CF33: evsel__hists_browse (in /home/csmantle/dist/linux-arch/tools/perf/perf)

This patch adds the missing free() specialization in loongarch_jump_ops,
which prevents disasm_line__free() from invoking the default cleanup
function.

Fixes: fb7fd2a14a503b9a ("perf annotate: Move raw_comment and raw_func_start fields out of 'struct ins_operands'")
Cc: stable@vger.kernel.org
Cc: WANG Rui <wangrui@loongson.cn>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: loongarch@lists.linux.dev
Signed-off-by: Rong Bao <rong.bao@csmantle.top>
Tested-by: WANG Rui <wangrui@loongson.cn>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/util/annotate-arch/annotate-loongarch.c | 1 +
 tools/perf/util/disasm.c                           | 2 +-
 tools/perf/util/disasm.h                           | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/annotate-arch/annotate-loongarch.c b/tools/perf/util/annotate-arch/annotate-loongarch.c
index 950f34e59e5c..c2addca77320 100644
--- a/tools/perf/util/annotate-arch/annotate-loongarch.c
+++ b/tools/perf/util/annotate-arch/annotate-loongarch.c
@@ -110,6 +110,7 @@ static int loongarch_jump__parse(const struct arch *arch, struct ins_operands *o
 }
 
 static const struct ins_ops loongarch_jump_ops = {
+	.free	   = jump__delete,
 	.parse	   = loongarch_jump__parse,
 	.scnprintf = jump__scnprintf,
 	.is_jump   = true,
diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c
index 4f5bd9153552..59ba88e1f744 100644
--- a/tools/perf/util/disasm.c
+++ b/tools/perf/util/disasm.c
@@ -452,7 +452,7 @@ int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
 			 ops->target.offset);
 }
 
-static void jump__delete(struct ins_operands *ops __maybe_unused)
+void jump__delete(struct ins_operands *ops __maybe_unused)
 {
 	/*
 	 * The ops->jump.raw_comment and ops->jump.raw_func_start belong to the
diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h
index a6e478caf61a..25756e3f47e4 100644
--- a/tools/perf/util/disasm.h
+++ b/tools/perf/util/disasm.h
@@ -161,6 +161,8 @@ int jump__scnprintf(const struct ins *ins, char *bf, size_t size,
 int mov__scnprintf(const struct ins *ins, char *bf, size_t size,
 		   struct ins_operands *ops, int max_ins_name);
 
+void jump__delete(struct ins_operands *ops);
+
 int symbol__disassemble(struct symbol *sym, struct annotate_args *args);
 
 char *expand_tabs(char *line, char **storage, size_t *storage_len);

From 841dbf4871c57ce2da18c4ea7ffac5487d0eda16 Mon Sep 17 00:00:00 2001
From: WANG Rui <r@hev.cc>
Date: Tue, 14 Apr 2026 08:51:52 +0800
Subject: [PATCH 131/131] perf loongarch: Fix build failure with
 CONFIG_LIBDW_DWARF_UNWIND

Building perf for LoongArch fails when CONFIG_LIBDW_DWARF_UNWIND is
enabled because unwind-libdw.o is still referenced in
arch/loongarch/util/Build.

Fixes: e62fae9d9e8 ("perf unwind-libdw: Fix a cross-arch unwinding bug")
Signed-off-by: WANG Rui <r@hev.cc>
Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
---
 tools/perf/arch/loongarch/util/Build | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/arch/loongarch/util/Build b/tools/perf/arch/loongarch/util/Build
index 3ad73d0289f3..8d91e78d31c9 100644
--- a/tools/perf/arch/loongarch/util/Build
+++ b/tools/perf/arch/loongarch/util/Build
@@ -1,4 +1,3 @@
 perf-util-y += header.o
 
 perf-util-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
-perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o