From 8d72eba1cf8cecd76a2b4c1dd7673c2dc775f514 Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar
Date: Tue, 30 Jul 2024 04:49:18 +0000
Subject: [PATCH 1/2] perf/x86/rapl: Fix the energy-pkg event for AMD CPUs

After commit:

  63edbaa48a57 ("x86/cpu/topology: Add support for the AMD 0x80000026 leaf")

... on AMD processors that support the extended CPUID leaf 0x80000026, the
topology_die_cpumask() and topology_logical_die_id() macros no longer
return the package cpumask and package ID; instead they return the CCD
(Core Complex Die) mask and ID respectively. As a result, the scope of the
energy-pkg event shrinks from the package to a CCD.

So, change the PMU scope for AMD and Hygon back to package.

On a 12 CCD, 1 package AMD Zen4 Genoa machine:

Before:

  $ cat /sys/devices/power/cpumask
  0,8,16,24,32,40,48,56,64,72,80,88

The expected cpumask here is just "0": since this is a package-scope
event, a single CPU collects the event for all the CPUs in the package.

After:

  $ cat /sys/devices/power/cpumask
  0
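For illustration (not part of this change), a minimal user-space sketch
that exercises the package-scope event: it reads the collecting CPU from
the cpumask above and counts energy-pkg on it via perf_event_open(). The
sysfs paths and the 0x02 config value are assumptions based on the RAPL
PMU's usual sysfs layout (compare
/sys/bus/event_source/devices/power/events/energy-pkg on the target box):

  #include <linux/perf_event.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int sysfs_read_int(const char *path)
  {
  	FILE *f = fopen(path, "r");
  	int v = -1;

  	/* For cpumask files like "0,8,16,...", this picks the first CPU */
  	if (f && fscanf(f, "%d", &v) != 1)
  		v = -1;
  	if (f)
  		fclose(f);
  	return v;
  }

  int main(void)
  {
  	struct perf_event_attr attr;
  	long long count;
  	int type, cpu, fd;

  	/* Dynamic PMU type of the RAPL "power" PMU */
  	type = sysfs_read_int("/sys/bus/event_source/devices/power/type");
  	/* After this fix: one CPU per package, e.g. just "0" */
  	cpu = sysfs_read_int("/sys/devices/power/cpumask");
  	if (type < 0 || cpu < 0)
  		return 1;

  	memset(&attr, 0, sizeof(attr));
  	attr.size = sizeof(attr);
  	attr.type = type;
  	attr.config = 0x02;	/* energy-pkg (assumed, from events/energy-pkg) */

  	/* RAPL events are CPU-bound counting events: pid == -1, cpu fixed */
  	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
  	if (fd < 0)
  		return 1;

  	sleep(1);
  	if (read(fd, &count, sizeof(count)) == sizeof(count))
  		printf("energy-pkg count over ~1s: %lld\n", count);
  	close(fd);
  	return 0;
  }

With the fix in place, the CPU taken from the cpumask collects one
package-wide counter instead of a per-CCD one.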
[ mingo: Cleaned up the changelog ]

Signed-off-by: Dhananjay Ugwekar
Signed-off-by: Ingo Molnar
Reviewed-by: Kan Liang
Link: https://lore.kernel.org/r/20240904100934.3260-1-Dhananjay.Ugwekar@amd.com
---
 arch/x86/events/rapl.c | 47 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index b985ca79cf97..a481a939862e 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -103,6 +103,19 @@ static struct perf_pmu_events_attr event_attr_##v = {	\
 	.event_str	= str,						\
 };
 
+/*
+ * RAPL Package energy counter scope:
+ * 1. AMD/HYGON platforms have a per-PKG package energy counter
+ * 2. For Intel platforms
+ *	2.1. CLX-AP is multi-die and its RAPL MSRs are die-scope
+ *	2.2. Other Intel platforms are single die systems so the scope can be
+ *	     considered as either pkg-scope or die-scope, and we are considering
+ *	     them as die-scope.
+ */
+#define rapl_pmu_is_pkg_scope()				\
+	(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||	\
+	 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
 struct rapl_pmu {
 	raw_spinlock_t		lock;
 	int			n_active;
@@ -140,9 +153,25 @@ static unsigned int rapl_cntr_mask;
 static u64 rapl_timer_ms;
 static struct perf_msr *rapl_msrs;
 
+/*
+ * Helper functions to get the correct topology macros according to the
+ * RAPL PMU scope.
+ */
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+	return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+					 topology_logical_die_id(cpu);
+}
+
+static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
+{
+	return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
+					 topology_die_cpumask(cpu);
+}
+
 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 {
-	unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
+	unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
 
 	/*
 	 * The unsigned check also catches the '-1' return value for non
@@ -552,7 +581,7 @@ static int rapl_cpu_offline(unsigned int cpu)
 	pmu->cpu = -1;
 
 	/* Find a new cpu to collect rapl events */
-	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+	target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu);
 
 	/* Migrate rapl events to the new target */
 	if (target < nr_cpu_ids) {
@@ -565,6 +594,11 @@ static int rapl_cpu_offline(unsigned int cpu)
 
 static int rapl_cpu_online(unsigned int cpu)
 {
+	s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu);
+	if (rapl_pmu_idx < 0) {
+		pr_err("topology_logical_(package/die)_id() returned a negative value");
+		return -EINVAL;
+	}
 	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
 	int target;
 
@@ -579,14 +613,14 @@ static int rapl_cpu_online(unsigned int cpu)
 		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
 		rapl_hrtimer_init(pmu);
 
-		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+		rapl_pmus->pmus[rapl_pmu_idx] = pmu;
 	}
 
 	/*
 	 * Check if there is an online cpu in the package which collects rapl
 	 * events already.
 	 */
-	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
+	target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu));
 	if (target < nr_cpu_ids)
 		return 0;
 
@@ -675,7 +709,10 @@ static const struct attribute_group *rapl_attr_update[] = {
 
 static int __init init_rapl_pmus(void)
 {
-	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+	int nr_rapl_pmu = topology_max_packages();
+
+	if (!rapl_pmu_is_pkg_scope())
+		nr_rapl_pmu *= topology_max_dies_per_package();
 
 	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
 	if (!rapl_pmus)

From ef493f4b122d6b14a6de111d1acac1eab1d673b0 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Mon, 9 Sep 2024 08:58:48 -0700
Subject: [PATCH 2/2] perf/x86/intel: Allow to setup LBR for counting event
 for BPF

The BPF subsystem may capture LBR data on a counting event. However, the
current implementation assumes that LBR can/should only be used with
sampling events.

For instance, the retsnoop tool ([0]) makes extensive use of this
functionality and sets up the perf event as follows:

  struct perf_event_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.size = sizeof(attr);
  attr.type = PERF_TYPE_HARDWARE;
  attr.config = PERF_COUNT_HW_CPU_CYCLES;
  attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
  attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL;

Limiting the LBR to sampling events was only meant to avoid unnecessary
branch stack setup for a counting event in a sample read group, because
the LBR is only read on a sampling event's overflow.

Although the LBR is used with sampling in most cases, there is no HW
limitation that ties the LBR to sampling mode. Allow LBR setup for a
counting event, unless it is in sample read mode.

[0] https://github.com/anakryiko/retsnoop
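For context (again not part of the commit), here is the snippet above
completed into a self-contained program; the perf_event_open() wrapper is
the usual raw-syscall helper and is an assumption, since retsnoop's actual
setup code is not shown. Note that no sample_period/sample_freq is set,
which makes this a counting event, i.e. exactly the case this patch
enables:

  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* perf_event_open() has no glibc wrapper */
  static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
  			   int cpu, int group_fd, unsigned long flags)
  {
  	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
  }

  int main(void)
  {
  	struct perf_event_attr attr;
  	int fd;

  	memset(&attr, 0, sizeof(attr));
  	attr.size = sizeof(attr);
  	attr.type = PERF_TYPE_HARDWARE;
  	attr.config = PERF_COUNT_HW_CPU_CYCLES;
  	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
  	attr.branch_sample_type = PERF_SAMPLE_BRANCH_KERNEL;
  	/* No sample_period/sample_freq: a counting event. Before this
  	 * patch, PERF_X86_EVENT_NEEDS_BRANCH_STACK was never set for such
  	 * an event, so the LBR was never configured for it.
  	 */

  	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any CPU */, -1, 0);

  	/* A BPF program attached by the tool would then read the branch
  	 * data with the bpf_read_branch_records() helper.
  	 */
  	return fd < 0 ? 1 : 0;
  }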
Fixes: 85846b27072d ("perf/x86: Add PERF_X86_EVENT_NEEDS_BRANCH_STACK flag")
Closes: https://lore.kernel.org/lkml/20240905180055.1221620-1-andrii@kernel.org/
Reported-by: Andrii Nakryiko
Signed-off-by: Kan Liang
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Tested-by: Andrii Nakryiko
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240909155848.326640-1-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9e519d8a810a..d879478db3f5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3972,8 +3972,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		x86_pmu.pebs_aliases(event);
 	}
 
-	if (needs_branch_stack(event) && is_sampling_event(event))
-		event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+	if (needs_branch_stack(event)) {
+		/* Avoid branch stack setup for counting events in SAMPLE READ */
+		if (is_sampling_event(event) ||
+		    !(event->attr.sample_type & PERF_SAMPLE_READ))
+			event->hw.flags |= PERF_X86_EVENT_NEEDS_BRANCH_STACK;
+	}
 
 	if (branch_sample_counters(event)) {
 		struct perf_event *leader, *sibling;