diff --git a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst index 7b7ba5700ca1..b8cfbb80be1c 100644 --- a/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst +++ b/Documentation/admin-guide/perf/nvidia-tegra410-pmu.rst @@ -6,6 +6,7 @@ The NVIDIA Tegra410 SoC includes various system PMUs to measure key performance metrics like memory bandwidth, latency, and utilization: * Unified Coherence Fabric (UCF) +* PCIE PMU Driver ---------- @@ -104,3 +105,165 @@ Example usage: destination filter = remote memory:: perf stat -a -e nvidia_ucf_pmu_1/event=0x0,src_loc_noncpu=0x1,dst_rem=0x1/ + +PCIE PMU +-------- + +This PMU is located in the SOC fabric connecting the PCIE root complex (RC) and +the memory subsystem. It monitors all read/write traffic from the root port(s) +or a particular BDF in a PCIE RC to local or remote memory. There is one PMU per +PCIE RC in the SoC. Each RC can have up to 16 lanes that can be bifurcated into +up to 8 root ports. The traffic from each root port can be filtered using RP or +BDF filter. For example, specifying "src_rp_mask=0xFF" means the PMU counter will +capture traffic from all RPs. Please see below for more details. + +The events and configuration options of this PMU device are described in sysfs, +see /sys/bus/event_source/devices/nvidia_pcie_pmu__rc_. + +The events in this PMU can be used to measure bandwidth, utilization, and +latency: + + * rd_req: count the number of read requests by PCIE device. + * wr_req: count the number of write requests by PCIE device. + * rd_bytes: count the number of bytes transferred by rd_req. + * wr_bytes: count the number of bytes transferred by wr_req. + * rd_cum_outs: count outstanding rd_req each cycle. + * cycles: count the clock cycles of SOC fabric connected to the PCIE interface. + +The average bandwidth is calculated as:: + + AVG_RD_BANDWIDTH_IN_GBPS = RD_BYTES / ELAPSED_TIME_IN_NS + AVG_WR_BANDWIDTH_IN_GBPS = WR_BYTES / ELAPSED_TIME_IN_NS + +The average request rate is calculated as:: + + AVG_RD_REQUEST_RATE = RD_REQ / CYCLES + AVG_WR_REQUEST_RATE = WR_REQ / CYCLES + + +The average latency is calculated as:: + + FREQ_IN_GHZ = CYCLES / ELAPSED_TIME_IN_NS + AVG_LATENCY_IN_CYCLES = RD_CUM_OUTS / RD_REQ + AVERAGE_LATENCY_IN_NS = AVG_LATENCY_IN_CYCLES / FREQ_IN_GHZ + +The PMU events can be filtered based on the traffic source and destination. +The source filter indicates the PCIE devices that will be monitored. The +destination filter specifies the destination memory type, e.g. local system +memory (CMEM), local GPU memory (GMEM), or remote memory. The local/remote +classification of the destination filter is based on the home socket of the +address, not where the data actually resides. These filters can be found in +/sys/bus/event_source/devices/nvidia_pcie_pmu__rc_/format/. + +The list of event filters: + +* Source filter: + + * src_rp_mask: bitmask of root ports that will be monitored. Each bit in this + bitmask represents the RP index in the RC. If the bit is set, all devices under + the associated RP will be monitored. E.g "src_rp_mask=0xF" will monitor + devices in root port 0 to 3. + * src_bdf: the BDF that will be monitored. This is a 16-bit value that + follows formula: (bus << 8) + (device << 3) + (function). For example, the + value of BDF 27:01.1 is 0x2781. + * src_bdf_en: enable the BDF filter. If this is set, the BDF filter value in + "src_bdf" is used to filter the traffic. + + Note that Root-Port and BDF filters are mutually exclusive and the PMU in + each RC can only have one BDF filter for the whole counters. If BDF filter + is enabled, the BDF filter value will be applied to all events. + +* Destination filter: + + * dst_loc_cmem: if set, count events to local system memory (CMEM) address + * dst_loc_gmem: if set, count events to local GPU memory (GMEM) address + * dst_loc_pcie_p2p: if set, count events to local PCIE peer address + * dst_loc_pcie_cxl: if set, count events to local CXL memory address + * dst_rem: if set, count events to remote memory address + +If the source filter is not specified, the PMU will count events from all root +ports. If the destination filter is not specified, the PMU will count events +to all destinations. + +Example usage: + +* Count event id 0x0 from root port 0 of PCIE RC-0 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_0/event=0x0,src_rp_mask=0x1/ + +* Count event id 0x1 from root port 0 and 1 of PCIE RC-1 on socket 0 and + targeting just local CMEM of socket 0:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_1/event=0x1,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x2 from root port 0 of PCIE RC-2 on socket 1 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_2/event=0x2,src_rp_mask=0x1/ + +* Count event id 0x3 from root port 0 and 1 of PCIE RC-3 on socket 1 and + targeting just local CMEM of socket 1:: + + perf stat -a -e nvidia_pcie_pmu_1_rc_3/event=0x3,src_rp_mask=0x3,dst_loc_cmem=0x1/ + +* Count event id 0x4 from BDF 01:01.0 of PCIE RC-4 on socket 0 targeting all + destinations:: + + perf stat -a -e nvidia_pcie_pmu_0_rc_4/event=0x4,src_bdf=0x0180,src_bdf_en=0x1/ + +Mapping the RC# to lspci segment number can be non-trivial; hence a new NVIDIA +Designated Vendor Specific Capability (DVSEC) register is added into the PCIE config space +for each RP. This DVSEC has vendor id "10de" and DVSEC id of "0x4". The DVSEC register +contains the following information to map PCIE devices under the RP back to its RC# : + + - Bus# (byte 0xc) : bus number as reported by the lspci output + - Segment# (byte 0xd) : segment number as reported by the lspci output + - RP# (byte 0xe) : port number as reported by LnkCap attribute from lspci for a device with Root Port capability + - RC# (byte 0xf): root complex number associated with the RP + - Socket# (byte 0x10): socket number associated with the RP + +Example script for mapping lspci BDF to RC# and socket#:: + + #!/bin/bash + while read bdf rest; do + dvsec4_reg=$(lspci -vv -s $bdf | awk ' + /Designated Vendor-Specific: Vendor=10de ID=0004/ { + match($0, /\[([0-9a-fA-F]+)/, arr); + print "0x" arr[1]; + exit + } + ') + if [ -n "$dvsec4_reg" ]; then + bus=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xc))).b) + segment=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xd))).b) + rp=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xe))).b) + rc=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0xf))).b) + socket=$(setpci -s $bdf $(printf '0x%x' $((${dvsec4_reg} + 0x10))).b) + echo "$bdf: Bus=$bus, Segment=$segment, RP=$rp, RC=$rc, Socket=$socket" + fi + done < <(lspci -d 10de:) + +Example output:: + + 0001:00:00.0: Bus=00, Segment=01, RP=00, RC=00, Socket=00 + 0002:80:00.0: Bus=80, Segment=02, RP=01, RC=01, Socket=00 + 0002:a0:00.0: Bus=a0, Segment=02, RP=02, RC=01, Socket=00 + 0002:c0:00.0: Bus=c0, Segment=02, RP=03, RC=01, Socket=00 + 0002:e0:00.0: Bus=e0, Segment=02, RP=04, RC=01, Socket=00 + 0003:00:00.0: Bus=00, Segment=03, RP=00, RC=02, Socket=00 + 0004:00:00.0: Bus=00, Segment=04, RP=00, RC=03, Socket=00 + 0005:00:00.0: Bus=00, Segment=05, RP=00, RC=04, Socket=00 + 0005:40:00.0: Bus=40, Segment=05, RP=01, RC=04, Socket=00 + 0005:c0:00.0: Bus=c0, Segment=05, RP=02, RC=04, Socket=00 + 0006:00:00.0: Bus=00, Segment=06, RP=00, RC=05, Socket=00 + 0009:00:00.0: Bus=00, Segment=09, RP=00, RC=00, Socket=01 + 000a:80:00.0: Bus=80, Segment=0a, RP=01, RC=01, Socket=01 + 000a:a0:00.0: Bus=a0, Segment=0a, RP=02, RC=01, Socket=01 + 000a:e0:00.0: Bus=e0, Segment=0a, RP=03, RC=01, Socket=01 + 000b:00:00.0: Bus=00, Segment=0b, RP=00, RC=02, Socket=01 + 000c:00:00.0: Bus=00, Segment=0c, RP=00, RC=03, Socket=01 + 000d:00:00.0: Bus=00, Segment=0d, RP=00, RC=04, Socket=01 + 000d:40:00.0: Bus=40, Segment=0d, RP=01, RC=04, Socket=01 + 000d:c0:00.0: Bus=c0, Segment=0d, RP=02, RC=04, Socket=01 + 000e:00:00.0: Bus=00, Segment=0e, RP=00, RC=05, Socket=01 diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c index 8e37cbe3bae9..61fde84ea343 100644 --- a/drivers/perf/arm_cspmu/nvidia_cspmu.c +++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include "arm_cspmu.h" @@ -28,6 +29,19 @@ #define NV_UCF_FILTER_DST GENMASK_ULL(11, 8) #define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST) +#define NV_PCIE_V2_PORT_COUNT 8ULL +#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0) +#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_BDF_EN BIT(24) +#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT) +#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT + +#define NV_PCIE_V2_DST_COUNT 5ULL +#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0) +#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0) +#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST + #define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0) #define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION) @@ -161,6 +175,16 @@ static struct attribute *ucf_pmu_event_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_event_attrs[] = { + ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0), + ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1), + ARM_CSPMU_EVENT_ATTR(rd_req, 0x2), + ARM_CSPMU_EVENT_ATTR(wr_req, 0x3), + ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4), + ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), + NULL +}; + static struct attribute *generic_pmu_event_attrs[] = { ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT), NULL, @@ -201,6 +225,19 @@ static struct attribute *ucf_pmu_format_attrs[] = { NULL }; +static struct attribute *pcie_v2_pmu_format_attrs[] = { + ARM_CSPMU_FORMAT_EVENT_ATTR, + ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"), + ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"), + ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"), + ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"), + ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"), + NULL +}; + static struct attribute *generic_pmu_format_attrs[] = { ARM_CSPMU_FORMAT_EVENT_ATTR, ARM_CSPMU_FORMAT_FILTER_ATTR, @@ -232,6 +269,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu) return ctx->name; } +#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64) +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + struct fwnode_handle *fwnode; + struct acpi_device *adev; + int ret; + + adev = arm_cspmu_acpi_dev_get(cspmu); + if (!adev) + return -ENODEV; + + fwnode = acpi_fwnode_handle(adev); + ret = fwnode_property_read_u32(fwnode, "instance_id", id); + if (ret) + dev_err(cspmu->dev, "Failed to get instance ID\n"); + + acpi_dev_put(adev); + return ret; +} +#else +static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id) +{ + return -EINVAL; +} +#endif + static u32 nv_cspmu_event_filter(const struct perf_event *event) { const struct nv_cspmu_ctx *ctx = @@ -277,6 +340,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu, } } +static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu, + const struct perf_event *event) +{ + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + const u32 offset = 4 * event->hw.idx; + + if (ctx->get_filter) + writel(0, cspmu->base0 + PMEVFILTR + offset); + + if (ctx->get_filter2) + writel(0, cspmu->base0 + PMEVFILT2R + offset); +} + static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu, const struct perf_event *event) { @@ -307,9 +384,103 @@ static u32 ucf_pmu_event_filter(const struct perf_event *event) return ret; } +static u32 pcie_v2_pmu_bdf_val_en(u32 filter) +{ + const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter); + + /* Returns both BDF value and enable bit if BDF filtering is enabled. */ + if (bdf_en) + return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter); + + /* Ignore the BDF value if BDF filter is not enabled. */ + return 0; +} + +static u32 pcie_v2_pmu_event_filter(const struct perf_event *event) +{ + u32 filter, lead_filter, lead_bdf; + struct perf_event *leader; + const struct nv_cspmu_ctx *ctx = + to_nv_cspmu_ctx(to_arm_cspmu(event->pmu)); + + filter = event->attr.config1 & ctx->filter_mask; + if (filter != 0) + return filter; + + leader = event->group_leader; + + /* Use leader's filter value if its BDF filtering is enabled. */ + if (event != leader) { + lead_filter = pcie_v2_pmu_event_filter(leader); + lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + if (lead_bdf != 0) + return lead_filter; + } + + /* Otherwise, return default filter value. */ + return ctx->filter_default_val; +} + +static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu, + struct perf_event *new_ev) +{ + /* + * Make sure the events are using same BDF filter since the PCIE-SRC PMU + * only supports one common BDF filter setting for all of the counters. + */ + + int idx; + u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf; + struct perf_event *new_leader; + + if (cspmu->impl.ops.is_cycle_counter_event(new_ev)) + return 0; + + new_leader = new_ev->group_leader; + + new_filter = pcie_v2_pmu_event_filter(new_ev); + new_lead_filter = pcie_v2_pmu_event_filter(new_leader); + + new_bdf = pcie_v2_pmu_bdf_val_en(new_filter); + new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter); + + new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter); + + if (new_rp != 0 && new_bdf != 0) { + dev_err(cspmu->dev, + "RP and BDF filtering are mutually exclusive\n"); + return -EINVAL; + } + + if (new_bdf != new_lead_bdf) { + dev_err(cspmu->dev, + "sibling and leader BDF value should be equal\n"); + return -EINVAL; + } + + /* Compare BDF filter on existing events. */ + idx = find_first_bit(cspmu->hw_events.used_ctrs, + cspmu->cycle_counter_logical_idx); + + if (idx != cspmu->cycle_counter_logical_idx) { + struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader; + + const u32 lead_filter = pcie_v2_pmu_event_filter(leader); + const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter); + + if (new_lead_bdf != lead_bdf) { + dev_err(cspmu->dev, "only one BDF value is supported\n"); + return -EINVAL; + } + } + + return 0; +} + enum nv_cspmu_name_fmt { NAME_FMT_GENERIC, - NAME_FMT_SOCKET + NAME_FMT_SOCKET, + NAME_FMT_SOCKET_INST, }; struct nv_cspmu_match { @@ -427,6 +598,26 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { .get_filter = ucf_pmu_event_filter, }, }, + { + .prodid = 0x10301000, + .prodid_mask = NV_PRODID_MASK, + .name_pattern = "nvidia_pcie_pmu_%u_rc_%u", + .name_fmt = NAME_FMT_SOCKET_INST, + .template_ctx = { + .event_attr = pcie_v2_pmu_event_attrs, + .format_attr = pcie_v2_pmu_format_attrs, + .filter_mask = NV_PCIE_V2_FILTER_ID_MASK, + .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT, + .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK, + .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT, + .get_filter = pcie_v2_pmu_event_filter, + .get_filter2 = nv_cspmu_event_filter2, + }, + .ops = { + .validate_event = pcie_v2_pmu_validate_event, + .reset_ev_filter = nv_cspmu_reset_ev_filter, + } + }, { .prodid = 0, .prodid_mask = 0, @@ -450,7 +641,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = { static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, const struct nv_cspmu_match *match) { - char *name; + char *name = NULL; struct device *dev = cspmu->dev; static atomic_t pmu_generic_idx = {0}; @@ -464,13 +655,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu, socket); break; } + case NAME_FMT_SOCKET_INST: { + const int cpu = cpumask_first(&cspmu->associated_cpus); + const int socket = cpu_to_node(cpu); + u32 inst_id; + + if (!nv_cspmu_get_inst_id(cspmu, &inst_id)) + name = devm_kasprintf(dev, GFP_KERNEL, + match->name_pattern, socket, inst_id); + break; + } case NAME_FMT_GENERIC: name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern, atomic_fetch_inc(&pmu_generic_idx)); break; - default: - name = NULL; - break; } return name; @@ -511,8 +709,10 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu) cspmu->impl.ctx = ctx; /* NVIDIA specific callbacks. */ + SET_OP(validate_event, impl_ops, match, NULL); SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter); SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter); + SET_OP(reset_ev_filter, impl_ops, match, NULL); SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs); SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs); SET_OP(get_name, impl_ops, match, nv_cspmu_get_name);