perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has outgrown its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic facility for event enumeration,
reporting, logging, monitoring and analysis.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in all, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names (in an ABI-compatible fashion).
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks go to Stephane Eranian, who first observed this misnomer and
suggested the rename.
User-space tooling and ABI compatibility are not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the number of pending patches
is smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */
#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/uaccess.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include
those headers directly instead of assuming availability. As this
conversion needs to touch a large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following:
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include so that its order conforms
to its surroundings. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
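The rule the sweep enforces is plain include-what-you-use for the slab and gfp facilities; a minimal sketch (hypothetical helper, not part of this patch):

#include <linux/gfp.h>          /* GFP_KERNEL */
#include <linux/slab.h>         /* kzalloc(), kfree() */

/* A file that allocates names its own dependencies instead of relying
 * on an indirect include via percpu.h/sched.h. */
static int *alloc_one_int(void)
{
        return kzalloc(sizeof(int), GFP_KERNEL);
}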
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/static_call.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>

#include "perf_event.h"
struct x86_pmu x86_pmu __read_mostly;
static struct pmu pmu;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
        .enabled = 1,
        .pmu = &pmu,
};

DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
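As a rough sketch of the lookup this implies (hypothetical names and shapes, not the kernel's actual hybrid() helpers): the architectural copy is used on non-hybrid parts, while hybrid parts consult the capability word stored in the dedicated per-PMU structure.

#include <stdbool.h>
#include <stdint.h>

struct hybrid_pmu_caps {
        uint64_t intel_cap;             /* model-specific capabilities */
};

struct pmu_caps {
        uint64_t intel_cap;             /* architectural (common) bits only */
        bool is_hybrid;                 /* stands in for the perf_is_hybrid key */
        struct hybrid_pmu_caps *active; /* hybrid PMU the event belongs to */
};

static uint64_t effective_intel_cap(const struct pmu_caps *caps)
{
        /* On hybrid parts the global copy deliberately carries only the
         * common bits, so feature checks must go via the hybrid PMU. */
        if (caps->is_hybrid && caps->active)
                return caps->active->intel_cap;

        return caps->intel_cap;
}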
DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);

/*
 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
 * from just a typename, as opposed to an actual function.
 */
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);

DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);

DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);

DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);

DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);

DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);

DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);

DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
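How these NULL static calls are typically filled in once the active x86_pmu is known, and how a call site then goes through them, can be sketched as follows (hypothetical helper names; the exact update routine in this file may differ):

/* Patch the trampolines to the methods of the detected PMU. */
static void example_static_call_update(void)
{
        static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
        static_call_update(x86_pmu_add, x86_pmu.add);
        static_call_update(x86_pmu_del, x86_pmu.del);
}

/* Callers then take a direct, patched call instead of loading a
 * function pointer out of x86_pmu and branching indirectly. */
static void example_del(struct perf_event *event)
{
        static_call(x86_pmu_del)(event);
}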
perf: Rewrite core context handling
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).
Notably:
- HW breakpoint PMU
- ARM big.little PMU / Intel ADL PMU
- Intel Branch Monitoring PMU
- AMD IBS PMU
- S390 cpum_cf PMU
- PowerPC trace_imc PMU
*Current design:*
Currently we have a per task and per cpu perf_event_contexts:
  task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
       ^                                  |   ^                 |   ^
       `----------------------------------'   |                 `--> pmu ---'
                                               v                      ^
                                          perf_event -----------------'
Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task related perf_event_context's have a
pointer back to that task.
Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.
The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.
Each perf_event is then associated with its PMU and one
perf_event_context.
*Proposed design:*
The new design proposed by this patch reduces this to a single task
context and a single CPU context, but adds some intermediate data structures:
  task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
       ^                              |   ^ ^
       `------------------------------'   | |
                                           | |    perf_cpu_pmu_context <--.
                                           | `----.    ^                  |
                                           |      |    |                  |
                                           |      v    v                  |
                                           | ,--> perf_event_pmu_context  |
                                           | |                            |
                                           | |                            |
                                           v v                            |
                                      perf_event ---> pmu ----------------'
With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:
{cpu, pmu, cgroup, group_index}
Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.
Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.
Each perf_event is associated with its pmu, perf_event_context and
perf_event_pmu_context.
Further optimizations to the current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.
Much thanks to Ravi for picking this up and pushing it towards
completion.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
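As a rough illustration of the relationships described above, a heavily trimmed, hypothetical sketch of the two intermediate objects (the real definitions live in the generic perf code and carry far more state):

/* Per-pmu-per-context state: hangs off a perf_event_context. */
struct perf_event_pmu_context_sketch {
        struct pmu                      *pmu;
        struct perf_event_context      *ctx;           /* owning context */
        struct list_head                pinned_active;  /* active events of this pmu */
        struct list_head                flexible_active;
        void                            *task_ctx_data; /* pmu-specific state */
};

/* Per-pmu-per-cpu state: owns the rotation hrtimer and points at the
 * pmu-context of the task currently scheduled on this CPU. */
struct perf_cpu_pmu_context_sketch {
        struct perf_event_pmu_context_sketch    epc;
        struct perf_event_pmu_context_sketch    *task_epc;
        struct hrtimer                          hrtimer;
};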
DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);

/*
 * This one is magic, it will get called even when PMU init fails (because
 * there is no PMU), in which case it should simply return NULL.
 */
DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);

perf_counter: Implement generalized cache event types
Extend generic event enumeration with the PERF_TYPE_HW_CACHE
method.
This is a 3-dimensional space:
{ L1-D, L1-I, L2, ITLB, DTLB, BPU } x
{ load, store, prefetch } x
{ accesses, misses }
User-space passes in the 3 coordinates and the kernel provides
a counter. (if the hardware supports that type and if the
combination makes sense.)
Combinations that make no sense produce a -EINVAL.
Combinations that are not supported by the hardware produce -ENOTSUP.
Extend the tools to deal with this, and rewrite the event symbol
parsing code with various popular aliases for the units and
access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are
both valid aliases.
( x86 is supported for now, with the Nehalem event table filled in,
and with Core2 and Atom having placeholder tables. )
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
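For reference, the three coordinates above are what user space packs into perf_event_attr::config for PERF_TYPE_HW_CACHE events, as id | (op << 8) | (result << 16); a minimal sketch (hypothetical helper name) selecting L1D read misses:

#include <linux/perf_event.h>
#include <string.h>

static void fill_l1d_read_miss(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size   = sizeof(*attr);
        attr->type   = PERF_TYPE_HW_CACHE;
        attr->config = PERF_COUNT_HW_CACHE_L1D |
                       (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                       (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
}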
u64 __read_mostly hw_cache_event_ids
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        int shift = 64 - x86_pmu.cntval_bits;
        u64 prev_raw_count, new_raw_count;
        u64 delta;

        if (unlikely(!hwc->event_base))
                return 0;

        /*
         * Careful: an NMI might modify the previous event value.
         *
         * Our tactic to handle this is to first atomically read and
         * exchange a new raw count - then add that new-prev delta
         * count to the generic event atomically:
         */
again:
        prev_raw_count = local64_read(&hwc->prev_count);
        rdpmcl(hwc->event_base_rdpmc, new_raw_count);

        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
                            new_raw_count) != prev_raw_count)
                goto again;

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        local64_add(delta, &event->count);
        local64_sub(delta, &hwc->period_left);

        return new_raw_count;
}
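To see what the shift dance in x86_perf_event_update() buys, a small standalone illustration (plain C, not kernel code; the 48-bit width and the sample values are invented): shifting both samples up to bit 63 makes the unsigned subtraction wrap exactly like the hardware counter does, so shifting the difference back down recovers the true delta even across a counter wrap.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        int shift = 64 - 48;                    /* e.g. a 48-bit counter */
        uint64_t prev_raw = 0xffffffffff00ULL;  /* just below the wrap point */
        uint64_t new_raw  = 0x100ULL;           /* read back after the wrap */
        uint64_t delta;

        delta = (new_raw << shift) - (prev_raw << shift);  /* wraps mod 2^64 */
        delta >>= shift;                                   /* back to counter units */

        printf("delta = %llu\n", (unsigned long long)delta);  /* prints 512 */
        return 0;
}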
/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
        struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
        struct hw_perf_event_extra *reg;
        struct extra_reg *er;

        reg = &event->hw.extra_reg;

        if (!extra_regs)
                return 0;

        for (er = extra_regs; er->msr; er++) {
                if (er->event != (config & er->config_mask))
                        continue;
                if (event->attr.config1 & ~er->valid_mask)
                        return -EINVAL;
                /* Check if the extra msrs can be safely accessed */
                if (!er->extra_msr_access)
                        return -ENXIO;

                reg->idx = er->idx;
                reg->config = event->attr.config1;
                reg->reg = er->msr;
                break;
        }

        return 0;
}
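The table this walks can be pictured as follows; the entry is a sketch with made-up values (real tables live in the per-vendor event files) and is terminated, as the loop above expects, by an entry whose msr is zero:

static struct extra_reg example_extra_regs[] = {
        {
                .event            = 0x01b7,            /* event code to match under config_mask */
                .config_mask      = 0xffff,
                .valid_mask       = 0x3fffffffffULL,   /* accepted attr.config1 bits */
                .msr              = 0x1a6,             /* extra config MSR to program */
                .idx              = EXTRA_REG_RSP_0,
                .extra_msr_access = true,
        },
        { /* end marker: .msr == 0 terminates the walk */ },
};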
static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static inline int get_possible_num_counters(void)
{
        int i, num_counters = x86_pmu.num_counters;

        if (!is_hybrid())
                return num_counters;

        for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
                num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);

        return num_counters;
}

static bool reserve_pmc_hardware(void)
{
        int i, num_counters = get_possible_num_counters();

        for (i = 0; i < num_counters; i++) {
                if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
                        goto perfctr_fail;
        }

        for (i = 0; i < num_counters; i++) {
                if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
                        goto eventsel_fail;
        }

        return true;

eventsel_fail:
        for (i--; i >= 0; i--)
                release_evntsel_nmi(x86_pmu_config_addr(i));

        i = num_counters;

perfctr_fail:
        for (i--; i >= 0; i--)
                release_perfctr_nmi(x86_pmu_event_addr(i));

        return false;
}

static void release_pmc_hardware(void)
{
        int i, num_counters = get_possible_num_counters();

        for (i = 0; i < num_counters; i++) {
                release_perfctr_nmi(x86_pmu_event_addr(i));
                release_evntsel_nmi(x86_pmu_config_addr(i));
        }
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif
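The pmc_refcount/pmc_reserve_mutex pair declared earlier is what arbitrates access to these helpers; a sketch of the usual acquire pattern (hypothetical function name, PEBS/DS buffer handling omitted):

static int example_reserve_hardware(void)
{
        int err = 0;

        if (!atomic_inc_not_zero(&pmc_refcount)) {
                mutex_lock(&pmc_reserve_mutex);
                if (atomic_read(&pmc_refcount) == 0) {
                        if (!reserve_pmc_hardware())
                                err = -EBUSY;
                }
                if (!err)
                        atomic_inc(&pmc_refcount);
                mutex_unlock(&pmc_reserve_mutex);
        }

        return err;
}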
bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
{
        u64 val, val_fail = -1, val_new = ~0;
        int i, reg, reg_fail = -1, ret = 0;
perf/x86: Check all MSRs before passing hw check
check_hw_exists() has a number of checks which go to two exit
paths: msr_fail and bios_fail. Checks classified as msr_fail
will cause check_hw_exists() to return false, causing the PMU
not to be used; bios_fail checks will only cause a warning to be
printed, but will return true.
The problem is that if there are both msr failures and bios
failures, and the routine hits a bios_fail check first, it will
exit early and return true, not finishing the rest of the msr
checks. If those msrs are in fact broken, it will cause them to
be used erroneously.
In the case of a Xen PV VM, the guest OS has read access to all
the MSRs, but write access is white-listed to supported
features. Writes to unsupported MSRs have no effect. The PMU
MSRs are not (typically) supported, because they are expensive
to save and restore on a VM context switch. One of the
"msr_fail" checks is supposed to detect this circumstance (ether
for Xen or KVM) and disable the harware PMU.
However, on one of my AMD boxen, there is (apparently) a broken
BIOS which triggers one of the bios_fail checks. In particular,
MSR_K7_EVNTSEL0 has the ARCH_PERFMON_EVENTSEL_ENABLE bit set.
The guest kernel detects this because it has read access to all
MSRs, and causes it to skip the rest of the checks and try to
use the non-existent hardware PMU. This minimally causes a lot
of useless instruction emulation and Xen console spam; it may
cause other issues with the watchdog as well.
This changeset causes check_hw_exists() to go through all of the
msr checks, failing and returning false if any of them fail.
This makes sure that a guest running under Xen without a virtual
PMU will detect that there is no functioning PMU and not attempt
to use it.
This problem affects kernels as far back as 3.2, and should thus
be considered for backport.
Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Konrad Wilk <konrad.wilk@oracle.com>
Cc: Ian Campbell <ian.campbell@citrix.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Link: http://lkml.kernel.org/r/1365000388-32448-1-git-send-email-george.dunlap@eu.citrix.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
        int bios_fail = 0;
perf/x86: Tweak broken BIOS rules during check_hw_exists()
I stumbled upon an AMD box that had the BIOS using a hardware performance
counter. Instead of printing out a warning and continuing, it failed and
blocked further perf counter usage.
Looking through the history, I found this commit:
a5ebe0ba3dff ("perf/x86: Check all MSRs before passing hw check")
which tweaked the rules for a Xen guest on an almost identical box and now
changed the behaviour.
Unfortunately the rules were tweaked incorrectly and will always lead to
MSR failures even though the MSRs are completely fine.
What happens now is in arch/x86/kernel/cpu/perf_event.c::check_hw_exists():
<snip>
for (i = 0; i < x86_pmu.num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
}
}
<snip>
/*
* Read the current value, change it and read it back to see if it
* matches, this is needed to detect certain hardware emulators
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/
reg = x86_pmu_event_addr(0);
^^^^
if the first perf counter is enabled, then this routine will always fail
because the counter is running. :-(
if (rdmsrl_safe(reg, &val))
goto msr_fail;
val ^= 0xffffUL;
ret = wrmsrl_safe(reg, val);
ret |= rdmsrl_safe(reg, &val_new);
if (ret || val != val_new)
goto msr_fail;
The above bios_fail used to be a 'goto' which is why it worked in the past.
Further, most vendors have migrated to using fixed counters to hide their
evilness, hence this problem rarely shows up nowadays except on a few old boxes.
I fixed my problem and kept the spirit of the original Xen fix, by recording a
safe non-enable register to be used safely for the reading/writing check.
Because it is not enabled, this passes on bare-metal boxes, but
should continue to throw an msr_fail on Xen guests because the register isn't
emulated yet.
Now I get a proper bios_fail error message and Xen should still see their
msr_fail message (untested).
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: george.dunlap@eu.citrix.com
Cc: konrad.wilk@oracle.com
Link: http://lkml.kernel.org/r/1431976608-56970-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
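A standalone sketch of the probe the message describes, built around the recorded safe counter slot (hypothetical helper; the real check sits further down in check_hw_exists()):

static bool example_counter_write_sticks(int reg_safe)
{
        u64 val, val_new = ~0;
        unsigned int reg = x86_pmu_event_addr(reg_safe);

        /* Flip some low bits and make sure the write is visible; emulators
         * that silently ignore PMU MSR writes fail this test. */
        if (rdmsrl_safe(reg, &val))
                return false;
        val ^= 0xffffUL;
        if (wrmsrl_safe(reg, val))
                return false;
        if (rdmsrl_safe(reg, &val_new))
                return false;

        return val == val_new;
}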
        int reg_safe = -1;
2010-11-23 05:55:23 +08:00
|
|
|
|
2010-12-08 22:56:23 +08:00
|
|
|
/*
|
|
|
|
* Check to see if the BIOS enabled any of the counters, if so
|
|
|
|
* complain and bail.
|
|
|
|
*/
|
2021-04-12 22:30:45 +08:00
|
|
|
for (i = 0; i < num_counters; i++) {
|
2011-02-03 00:40:57 +08:00
|
|
|
reg = x86_pmu_config_addr(i);
|
2010-12-08 22:56:23 +08:00
|
|
|
ret = rdmsrl_safe(reg, &val);
|
|
|
|
if (ret)
|
|
|
|
goto msr_fail;
|
                if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
                        bios_fail = 1;
                        val_fail = val;
                        reg_fail = reg;
perf/x86: Tweak broken BIOS rules during check_hw_exists()
I stumbled upon an AMD box that had the BIOS using a hardware performance
counter. Instead of printing out a warning and continuing, it failed and
blocked further perf counter usage.
Looking through the history, I found this commit:
a5ebe0ba3dff ("perf/x86: Check all MSRs before passing hw check")
which tweaked the rules for a Xen guest on an almost identical box and now
changed the behaviour.
Unfortunately the rules were tweaked incorrectly and will always lead to
MSR failures even though the MSRs are completely fine.
What happens now is in arch/x86/kernel/cpu/perf_event.c::check_hw_exists():
<snip>
for (i = 0; i < x86_pmu.num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
}
}
<snip>
/*
* Read the current value, change it and read it back to see if it
* matches, this is needed to detect certain hardware emulators
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
*/
reg = x86_pmu_event_addr(0);
^^^^
if the first perf counter is enabled, then this routine will always fail
because the counter is running. :-(
if (rdmsrl_safe(reg, &val))
goto msr_fail;
val ^= 0xffffUL;
ret = wrmsrl_safe(reg, val);
ret |= rdmsrl_safe(reg, &val_new);
if (ret || val != val_new)
goto msr_fail;
The above bios_fail used to be a 'goto' which is why it worked in the past.
Further, most vendors have migrated to using fixed counters to hide their
evilness hence this problem rarely shows up now days except on a few old boxes.
I fixed my problem and kept the spirit of the original Xen fix, by recording a
safe non-enable register to be used safely for the reading/writing check.
Because it is not enabled, this passes on bare metal boxes (like metal), but
should continue to throw an msr_fail on Xen guests because the register isn't
emulated yet.
Now I get a proper bios_fail error message and Xen should still see their
msr_fail message (untested).
Signed-off-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: george.dunlap@eu.citrix.com
Cc: konrad.wilk@oracle.com
Link: http://lkml.kernel.org/r/1431976608-56970-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-19 03:16:48 +08:00
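A minimal self-contained model of the reg_safe selection described above (the enable-bit constant and the array of event-select values stand in for the real MSR reads; this is a sketch, not the kernel routine):

#include <stdint.h>
#include <stdio.h>

#define EVENTSEL_ENABLE (1ULL << 22)

/* Return the index of the first counter whose event-select is not enabled,
 * or -1 if the BIOS (or anything else) has already claimed every one. */
static int pick_safe_counter(const uint64_t *eventsel, int n)
{
    for (int i = 0; i < n; i++)
        if (!(eventsel[i] & EVENTSEL_ENABLE))
            return i;
    return -1;
}

int main(void)
{
    uint64_t sel[4] = { EVENTSEL_ENABLE, 0, 0, 0 };

    printf("safe counter: %d\n", pick_safe_counter(sel, 4));    /* prints 1 */
    return 0;
}

Only when this comes back as -1 (every counter busy) does it make sense to give up, which is exactly what the reg_safe == -1 branch further down does.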
|
|
|
} else {
|
|
|
|
reg_safe = i;
|
2013-04-03 22:46:28 +08:00
|
|
|
}
|
2010-12-08 22:56:23 +08:00
|
|
|
}
|
|
|
|
|
2021-04-12 22:30:45 +08:00
|
|
|
if (num_counters_fixed) {
|
2010-12-08 22:56:23 +08:00
|
|
|
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
|
|
|
|
ret = rdmsrl_safe(reg, &val);
|
|
|
|
if (ret)
|
|
|
|
goto msr_fail;
|
2021-04-12 22:30:45 +08:00
|
|
|
for (i = 0; i < num_counters_fixed; i++) {
|
|
|
|
if (fixed_counter_disabled(i, pmu))
|
2021-01-29 06:40:11 +08:00
|
|
|
continue;
|
2021-04-20 22:29:07 +08:00
|
|
|
if (val & (0x03ULL << i*4)) {
|
2013-04-03 22:46:28 +08:00
|
|
|
bios_fail = 1;
|
|
|
|
val_fail = val;
|
|
|
|
reg_fail = reg;
|
|
|
|
}
|
2010-12-08 22:56:23 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
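For reference, MSR_ARCH_PERFMON_FIXED_CTR_CTRL packs four control bits per fixed counter, and the low two bits of each nibble form the ring-0/ring-3 enable field, which is why the loop above tests val & (0x03ULL << i*4). A tiny standalone illustration (the sample value is made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical FIXED_CTR_CTRL value: fixed counter 1 enabled for ring 0 only. */
    uint64_t val = 0x1ULL << 4;

    for (int i = 0; i < 3; i++) {
        uint64_t enable = (val >> (i * 4)) & 0x03ULL;

        printf("fixed counter %d: enable field = %llu%s\n", i,
               (unsigned long long)enable,
               enable ? "  (claimed before perf started)" : "");
    }
    return 0;
}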
2015-05-19 03:16:48 +08:00
|
|
|
/*
|
|
|
|
* If all the counters are enabled, the below test will always
|
|
|
|
* fail. The tools will also become useless in this scenario.
|
|
|
|
* Just fail and disable the hardware counters.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (reg_safe == -1) {
|
|
|
|
reg = reg_safe;
|
|
|
|
goto msr_fail;
|
|
|
|
}
|
|
|
|
|
2010-12-08 22:56:23 +08:00
|
|
|
/*
|
2012-10-09 23:38:35 +08:00
|
|
|
* Read the current value, change it and read it back to see if it
|
|
|
|
* matches, this is needed to detect certain hardware emulators
|
|
|
|
* (qemu/kvm) that don't trap on the MSR access and always return 0s.
|
2010-12-08 22:56:23 +08:00
|
|
|
*/
|
2015-05-19 03:16:48 +08:00
|
|
|
reg = x86_pmu_event_addr(reg_safe);
|
2012-10-09 23:38:35 +08:00
|
|
|
if (rdmsrl_safe(reg, &val))
|
|
|
|
goto msr_fail;
|
|
|
|
val ^= 0xffffUL;
|
2012-06-21 02:46:36 +08:00
|
|
|
ret = wrmsrl_safe(reg, val);
|
|
|
|
ret |= rdmsrl_safe(reg, &val_new);
|
2010-11-23 05:55:23 +08:00
|
|
|
if (ret || val != val_new)
|
2010-12-08 22:56:23 +08:00
|
|
|
goto msr_fail;
|
2010-11-23 05:55:23 +08:00
|
|
|
|
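The write/read-back dance above exists because an emulator that silently ignores MSR writes will happily report success while returning stale data. A self-contained model of that detection (both backends are invented for illustration; the real code talks to a single MSR through wrmsrl_safe()/rdmsrl_safe()):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t real_msr;                        /* backing store for "real" hw */

/* Real hardware: reads and writes hit the backing store. */
static int rd_real(uint64_t *v) { *v = real_msr; return 0; }
static int wr_real(uint64_t v)  { real_msr = v;  return 0; }

/* Non-trapping emulator: writes are dropped, reads always return 0. */
static int rd_fake(uint64_t *v) { *v = 0; return 0; }
static int wr_fake(uint64_t v)  { (void)v; return 0; }

static bool counter_is_real(int (*rd)(uint64_t *), int (*wr)(uint64_t))
{
    uint64_t val, val_new;

    if (rd(&val))
        return false;
    val ^= 0xffffUL;                             /* flip some bits ... */
    if (wr(val) || rd(&val_new))
        return false;
    return val == val_new;                       /* ... and expect them back */
}

int main(void)
{
    printf("real hw:  %d\n", counter_is_real(rd_real, wr_real));   /* 1 */
    printf("emulator: %d\n", counter_is_real(rd_fake, wr_fake));   /* 0 */
    return 0;
}

Real hardware echoes the flipped bits back; the non-trapping backend does not, so the check fails exactly as intended for qemu/kvm-style emulation.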
2011-03-25 17:24:23 +08:00
|
|
|
/*
|
|
|
|
* We still allow the PMU driver to operate:
|
|
|
|
*/
|
2013-04-03 22:46:28 +08:00
|
|
|
if (bios_fail) {
|
2016-02-02 11:45:02 +08:00
|
|
|
pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
|
|
|
|
pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
|
|
|
|
reg_fail, val_fail);
|
2013-04-03 22:46:28 +08:00
|
|
|
}
|
2011-03-25 17:24:23 +08:00
|
|
|
|
|
|
|
return true;
|
2010-12-08 22:56:23 +08:00
|
|
|
|
|
|
|
msr_fail:
|
2016-08-01 19:37:07 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
|
|
|
|
pr_cont("PMU not available due to virtualization, using software events only.\n");
|
|
|
|
} else {
|
|
|
|
pr_cont("Broken PMU hardware detected, using software events only.\n");
|
|
|
|
pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
|
|
|
|
reg, val_new);
|
|
|
|
}
|
2011-03-25 17:24:23 +08:00
|
|
|
|
2010-12-08 22:56:23 +08:00
|
|
|
return false;
|
2010-11-23 05:55:23 +08:00
|
|
|
}
|
|
|
|
|
2009-09-21 18:02:48 +08:00
|
|
|
static void hw_perf_event_destroy(struct perf_event *event)
|
2009-03-31 01:07:16 +08:00
|
|
|
{
|
2015-06-11 20:13:56 +08:00
|
|
|
x86_release_hardware();
|
2015-06-09 18:03:26 +08:00
|
|
|
atomic_dec(&active_events);
|
2009-03-31 01:07:16 +08:00
|
|
|
}
|
|
|
|
|
2015-01-14 20:18:20 +08:00
|
|
|
void hw_perf_lbr_event_destroy(struct perf_event *event)
|
|
|
|
{
|
|
|
|
hw_perf_event_destroy(event);
|
|
|
|
|
|
|
|
/* undo the lbr/bts event accounting */
|
|
|
|
x86_del_exclusive(x86_lbr_exclusive_lbr);
|
|
|
|
}
|
|
|
|
|
2009-04-29 18:47:20 +08:00
|
|
|
static inline int x86_pmu_initialized(void)
|
|
|
|
{
|
|
|
|
return x86_pmu.handle_irq != NULL;
|
|
|
|
}
|
|
|
|
|
perf_counter: Implement generalized cache event types
Extend generic event enumeration with the PERF_TYPE_HW_CACHE
method.
This is a 3-dimensional space:
{ L1-D, L1-I, L2, ITLB, DTLB, BPU } x
{ load, store, prefetch } x
{ accesses, misses }
User-space passes in the 3 coordinates and the kernel provides
a counter. (if the hardware supports that type and if the
combination makes sense.)
Combinations that make no sense produce a -EINVAL.
Combinations that are not supported by the hardware produce -ENOTSUP.
Extend the tools to deal with this, and rewrite the event symbol
parsing code with various popular aliases for the units and
access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are
both valid aliases.
( x86 is supported for now, with the Nehalem event table filled in,
and with Core2 and Atom having placeholder tables. )
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-06-06 02:22:46 +08:00
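The 3-dimensional cache space described above is packed into a single config value: byte 0 selects the cache, byte 1 the operation, byte 2 the result. A minimal decoder of that packing (the *_MAX limits mirror the perf ABI values; everything else is illustrative only):

#include <stdint.h>
#include <stdio.h>

#define CACHE_MAX   7   /* PERF_COUNT_HW_CACHE_MAX */
#define OP_MAX      3   /* PERF_COUNT_HW_CACHE_OP_MAX */
#define RESULT_MAX  2   /* PERF_COUNT_HW_CACHE_RESULT_MAX */

static int decode_cache_config(uint64_t config,
                               unsigned *type, unsigned *op, unsigned *result)
{
    *type   = (config >>  0) & 0xff;
    *op     = (config >>  8) & 0xff;
    *result = (config >> 16) & 0xff;

    if (*type >= CACHE_MAX || *op >= OP_MAX || *result >= RESULT_MAX)
        return -1;      /* -EINVAL in the kernel */
    return 0;
}

int main(void)
{
    unsigned t, o, r;

    /* L1D (0) | read (0 << 8) | miss (1 << 16) */
    if (!decode_cache_config(0x10000, &t, &o, &r))
        printf("type=%u op=%u result=%u\n", t, o, r);
    return 0;
}

set_ext_hw_attr() below does the same unpacking, plus the array_index_nospec() clamping and the lookup in the per-PMU hw_cache_event_ids table.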
|
|
|
static inline int
|
2011-03-03 10:34:48 +08:00
|
|
|
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
|
2009-06-06 02:22:46 +08:00
|
|
|
{
|
2011-03-03 10:34:48 +08:00
|
|
|
struct perf_event_attr *attr = &event->attr;
|
2009-06-06 02:22:46 +08:00
|
|
|
unsigned int cache_type, cache_op, cache_result;
|
|
|
|
u64 config, val;
|
|
|
|
|
|
|
|
config = attr->config;
|
|
|
|
|
2018-04-20 20:06:29 +08:00
|
|
|
cache_type = (config >> 0) & 0xff;
|
2009-06-06 02:22:46 +08:00
|
|
|
if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
|
|
|
|
return -EINVAL;
|
2018-04-20 20:06:29 +08:00
|
|
|
cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
|
2009-06-06 02:22:46 +08:00
|
|
|
|
|
|
|
cache_op = (config >> 8) & 0xff;
|
|
|
|
if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
|
|
|
|
return -EINVAL;
|
2018-04-20 20:06:29 +08:00
|
|
|
cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
|
2009-06-06 02:22:46 +08:00
|
|
|
|
|
|
|
cache_result = (config >> 16) & 0xff;
|
|
|
|
if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
|
|
|
|
return -EINVAL;
|
2018-04-20 20:06:29 +08:00
|
|
|
cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
|
2009-06-06 02:22:46 +08:00
|
|
|
|
2021-04-12 22:30:48 +08:00
|
|
|
val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
|
2009-06-06 02:22:46 +08:00
|
|
|
if (val == 0)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
if (val == -1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
hwc->config |= val;
|
2021-04-12 22:30:48 +08:00
|
|
|
attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
|
2011-03-03 10:34:48 +08:00
|
|
|
return x86_pmu_extra_regs(val, event);
|
2009-06-06 02:22:46 +08:00
|
|
|
}
|
|
|
|
|
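The array_index_nospec() calls used above close a Spectre-v1 style hole: even after the bounds check, a mispredicted branch could speculatively index the cache tables out of range. A rough user-space analogue of the masking idea (this is not the kernel's implementation, just the shape of it):

#include <stddef.h>
#include <stdio.h>

/* Illustrative only: force an out-of-range index to 0 using pure data
 * dependencies, so a mispredicted bounds check cannot steer the index. */
static size_t index_nospec_model(size_t index, size_t size)
{
    /* mask is all-ones when index < size, all-zeroes otherwise */
    size_t mask = (size_t)0 - (size_t)(index < size);

    return index & mask;
}

int main(void)
{
    int table[4] = { 10, 20, 30, 40 };

    printf("%d\n", table[index_nospec_model(2, 4)]);   /* 30 */
    printf("%d\n", table[index_nospec_model(9, 4)]);   /* clamped to table[0] */
    return 0;
}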
2015-06-11 20:13:56 +08:00
|
|
|
int x86_reserve_hardware(void)
|
|
|
|
{
|
|
|
|
int err = 0;
|
|
|
|
|
2015-06-09 18:03:26 +08:00
|
|
|
if (!atomic_inc_not_zero(&pmc_refcount)) {
|
2015-06-11 20:13:56 +08:00
|
|
|
mutex_lock(&pmc_reserve_mutex);
|
2015-06-09 18:03:26 +08:00
|
|
|
if (atomic_read(&pmc_refcount) == 0) {
|
2021-04-30 13:22:47 +08:00
|
|
|
if (!reserve_pmc_hardware()) {
|
2015-06-11 20:13:56 +08:00
|
|
|
err = -EBUSY;
|
2021-04-30 13:22:47 +08:00
|
|
|
} else {
|
2015-06-11 20:13:56 +08:00
|
|
|
reserve_ds_buffers();
|
2021-04-30 13:22:47 +08:00
|
|
|
reserve_lbr_buffers();
|
|
|
|
}
|
2015-06-11 20:13:56 +08:00
|
|
|
}
|
|
|
|
if (!err)
|
2015-06-09 18:03:26 +08:00
|
|
|
atomic_inc(&pmc_refcount);
|
2015-06-11 20:13:56 +08:00
|
|
|
mutex_unlock(&pmc_reserve_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
void x86_release_hardware(void)
|
|
|
|
{
|
2015-06-09 18:03:26 +08:00
|
|
|
if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
|
2015-06-11 20:13:56 +08:00
|
|
|
release_pmc_hardware();
|
|
|
|
release_ds_buffers();
|
2020-07-03 20:49:29 +08:00
|
|
|
release_lbr_buffers();
|
2015-06-11 20:13:56 +08:00
|
|
|
mutex_unlock(&pmc_reserve_mutex);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
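x86_reserve_hardware() and x86_release_hardware() follow a common refcount-plus-mutex pattern: the hot path is a lock-free increment of an already non-zero count, and only the first or last user takes the mutex to actually grab or drop the hardware. A user-space analogue of that pattern (the names and the hw_grabbed flag are invented, and the release side is simplified to always take the lock, unlike atomic_dec_and_mutex_lock()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int refcount;
static pthread_mutex_t reserve_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool hw_grabbed;                 /* stand-in for the real reservation */

/* Increment only if the count is already non-zero (fast path). */
static bool inc_not_zero(atomic_int *v)
{
    int old = atomic_load(v);

    while (old != 0)
        if (atomic_compare_exchange_weak(v, &old, old + 1))
            return true;
    return false;
}

static void reserve(void)
{
    if (inc_not_zero(&refcount))
        return;                         /* someone already holds the hardware */

    pthread_mutex_lock(&reserve_mutex);
    if (atomic_load(&refcount) == 0)
        hw_grabbed = true;              /* first user: grab the resource */
    atomic_fetch_add(&refcount, 1);
    pthread_mutex_unlock(&reserve_mutex);
}

static void release(void)
{
    pthread_mutex_lock(&reserve_mutex);
    if (atomic_fetch_sub(&refcount, 1) == 1)
        hw_grabbed = false;             /* last user: drop the resource */
    pthread_mutex_unlock(&reserve_mutex);
}

int main(void)
{
    reserve();  reserve();
    release();  printf("still held: %d\n", hw_grabbed);   /* 1 */
    release();  printf("still held: %d\n", hw_grabbed);   /* 0 */
    return 0;
}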
2015-01-14 20:18:20 +08:00
|
|
|
/*
|
|
|
|
* Check if we can create event of a certain type (that no conflicting events
|
|
|
|
* are present).
|
|
|
|
*/
|
|
|
|
int x86_add_exclusive(unsigned int what)
|
|
|
|
{
|
2015-06-24 22:47:50 +08:00
|
|
|
int i;
|
2015-01-14 20:18:20 +08:00
|
|
|
|
2016-12-09 08:14:17 +08:00
|
|
|
/*
|
|
|
|
* When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
|
|
|
|
* LBR and BTS are still mutually exclusive.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
|
2019-12-10 18:51:01 +08:00
|
|
|
goto out;
|
2016-04-28 23:35:46 +08:00
|
|
|
|
2015-06-24 22:47:50 +08:00
|
|
|
if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
|
|
|
|
mutex_lock(&pmc_reserve_mutex);
|
|
|
|
for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
|
|
|
|
if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
|
|
|
|
goto fail_unlock;
|
|
|
|
}
|
|
|
|
atomic_inc(&x86_pmu.lbr_exclusive[what]);
|
|
|
|
mutex_unlock(&pmc_reserve_mutex);
|
2015-06-11 20:13:56 +08:00
|
|
|
}
|
2015-01-14 20:18:20 +08:00
|
|
|
|
2019-12-10 18:51:01 +08:00
|
|
|
out:
|
2015-06-24 22:47:50 +08:00
|
|
|
atomic_inc(&active_events);
|
|
|
|
return 0;
|
2015-01-14 20:18:20 +08:00
|
|
|
|
2015-06-24 22:47:50 +08:00
|
|
|
fail_unlock:
|
2015-01-14 20:18:20 +08:00
|
|
|
mutex_unlock(&pmc_reserve_mutex);
|
2015-06-24 22:47:50 +08:00
|
|
|
return -EBUSY;
|
2015-01-14 20:18:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void x86_del_exclusive(unsigned int what)
|
|
|
|
{
|
2019-12-10 18:51:01 +08:00
|
|
|
atomic_dec(&active_events);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See the comment in x86_add_exclusive().
|
|
|
|
*/
|
2016-12-09 08:14:17 +08:00
|
|
|
if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
|
2016-04-28 23:35:46 +08:00
|
|
|
return;
|
|
|
|
|
2015-01-14 20:18:20 +08:00
|
|
|
atomic_dec(&x86_pmu.lbr_exclusive[what]);
|
|
|
|
}
|
|
|
|
|
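x86_add_exclusive()/x86_del_exclusive() implement an "only one kind at a time" rule: a per-type reference count may grow freely, but taking the first reference of one type fails while any other type is in use. A stripped-down, single-threaded model of that rule (the real code uses atomics plus pmc_reserve_mutex; the type names here are placeholders):

#include <stdbool.h>
#include <stdio.h>

enum { EXCL_LBR, EXCL_BTS, EXCL_PT, EXCL_MAX };

static int excl[EXCL_MAX];

static bool add_exclusive(int what)
{
    if (excl[what] == 0)                        /* first user of this type: */
        for (int i = 0; i < EXCL_MAX; i++)      /* every other type must be idle */
            if (i != what && excl[i])
                return false;
    excl[what]++;
    return true;
}

static void del_exclusive(int what)
{
    excl[what]--;
}

int main(void)
{
    printf("%d\n", add_exclusive(EXCL_LBR));    /* 1: nothing else active */
    printf("%d\n", add_exclusive(EXCL_LBR));    /* 1: same type stacks fine */
    printf("%d\n", add_exclusive(EXCL_BTS));    /* 0: LBR already in use */
    del_exclusive(EXCL_LBR);
    del_exclusive(EXCL_LBR);
    printf("%d\n", add_exclusive(EXCL_BTS));    /* 1: now allowed */
    return 0;
}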
2011-08-31 07:41:05 +08:00
|
|
|
int x86_setup_perfctr(struct perf_event *event)
|
2010-04-14 04:23:11 +08:00
|
|
|
{
|
|
|
|
struct perf_event_attr *attr = &event->attr;
|
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
|
|
|
u64 config;
|
|
|
|
|
2010-11-23 23:21:43 +08:00
|
|
|
if (!is_sampling_event(event)) {
|
2010-04-14 04:23:11 +08:00
|
|
|
hwc->sample_period = x86_pmu.max_period;
|
|
|
|
hwc->last_period = hwc->sample_period;
|
2010-05-21 20:43:08 +08:00
|
|
|
local64_set(&hwc->period_left, hwc->sample_period);
|
2010-04-14 04:23:11 +08:00
|
|
|
}
|
|
|
|
|
perf/x86: Register hybrid PMUs
Different hybrid PMUs have different PMU capabilities and events. Perf
should register a dedicated PMU for each of them.
To check the X86 event, perf has to go through all possible hybrid pmus.
All the hybrid PMUs are registered at boot time. Before the
registration, add intel_pmu_check_hybrid_pmus() to check and update the
counter information, the event constraints, the extra registers and the
unique capabilities for each hybrid PMU.
Postpone the display of the PMU information and HW check to
CPU_STARTING, because the boot CPU is the only online CPU in the
init_hw_perf_events(). Perf doesn't know the availability of the other
PMUs. Perf should display the PMU information only if the counters of
the PMU are available.
One type of CPUs may be all offline. For this case, users can still
observe the PMU in /sys/devices, but its CPU mask is 0.
All hybrid PMUs have capability PERF_PMU_CAP_HETEROGENEOUS_CPUS.
The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned
later in a separate patch.
The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other
hybrid PMUs, the PMU type id is not hard-coded.
The event->cpu must be compatible with the supported CPUs of the PMU.
Add a check in the x86_pmu_event_init().
The events in a group must be from the same type of hybrid PMU.
The fake cpuc used in the validation must be from the supported CPU of
the event->pmu.
Perf may not retrieve a valid core type from get_this_hybrid_cpu_type().
For example, ADL may have an alternative configuration. With that
configuration, Perf cannot retrieve the core type from the CPUID leaf
0x1a. Add a platform specific get_hybrid_cpu_type(). If the generic way
fails, invoke the platform specific get_hybrid_cpu_type().
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-17-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:56 +08:00
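The comparison that follows used to be against PERF_TYPE_RAW; with hybrid PMUs every registered PMU carries its own type id, so a raw-style config is accepted verbatim only when the event's type matches the PMU it was opened on. A toy dispatcher under that assumption (the struct fields, names and type ids below are invented, not the kernel's):

#include <stdio.h>

struct toy_pmu   { int type; const char *name; };
struct toy_event { int attr_type; unsigned long long config; struct toy_pmu *pmu; };

/* Accept the config verbatim only if it was aimed at this very PMU. */
static int setup_config(struct toy_event *ev)
{
    if (ev->attr_type == ev->pmu->type) {
        printf("%s: raw config 0x%llx taken as-is\n", ev->pmu->name, ev->config);
        return 0;
    }
    printf("%s: type %d needs translation first\n", ev->pmu->name, ev->attr_type);
    return 1;
}

int main(void)
{
    struct toy_pmu core = { .type = 4, .name = "cpu_core" };   /* ids are made up */
    struct toy_event e1 = { .attr_type = 4, .config = 0xc0, .pmu = &core };
    struct toy_event e2 = { .attr_type = 3, .config = 0xc0, .pmu = &core };

    setup_config(&e1);
    setup_config(&e2);
    return 0;
}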
|
|
|
if (attr->type == event->pmu->type)
|
2011-11-14 17:03:25 +08:00
|
|
|
return x86_pmu_extra_regs(event->attr.config, event);
|
2010-04-14 04:23:11 +08:00
|
|
|
|
|
|
|
if (attr->type == PERF_TYPE_HW_CACHE)
|
2011-03-03 10:34:48 +08:00
|
|
|
return set_ext_hw_attr(hwc, event);
|
2010-04-14 04:23:11 +08:00
|
|
|
|
|
|
|
if (attr->config >= x86_pmu.max_events)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2018-04-20 20:08:58 +08:00
|
|
|
attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
|
|
|
|
|
2010-04-14 04:23:11 +08:00
|
|
|
/*
|
|
|
|
* The generic map:
|
|
|
|
*/
|
|
|
|
config = x86_pmu.event_map(attr->config);
|
|
|
|
|
|
|
|
if (config == 0)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
if (config == -1LL)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
hwc->config |= config;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2010-04-14 04:23:10 +08:00
|
|
|
|
2012-02-10 06:20:54 +08:00
|
|
|
/*
|
|
|
|
* check that branch_sample_type is compatible with
|
|
|
|
* settings needed for precise_ip > 1 which implies
|
|
|
|
* using the LBR to capture ALL taken branches at the
|
|
|
|
* priv levels of the measurement
|
|
|
|
*/
|
|
|
|
static inline int precise_br_compat(struct perf_event *event)
|
|
|
|
{
|
|
|
|
u64 m = event->attr.branch_sample_type;
|
|
|
|
u64 b = 0;
|
|
|
|
|
|
|
|
/* must capture all branches */
|
|
|
|
if (!(m & PERF_SAMPLE_BRANCH_ANY))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
|
|
|
|
|
|
|
|
if (!event->attr.exclude_user)
|
|
|
|
b |= PERF_SAMPLE_BRANCH_USER;
|
|
|
|
|
|
|
|
if (!event->attr.exclude_kernel)
|
|
|
|
b |= PERF_SAMPLE_BRANCH_KERNEL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
|
|
|
|
*/
|
|
|
|
|
|
|
|
return m == b;
|
|
|
|
}
|
|
|
|
|
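In other words, precise_ip > 1 is only compatible with a branch_sample_type that captures ANY branch at exactly the privilege levels the event itself measures. A compact model of that comparison (the flag values are placeholders, not the ABI encoding):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BR_ANY     (1u << 0)
#define BR_USER    (1u << 1)
#define BR_KERNEL  (1u << 2)

static bool precise_br_compat_model(uint32_t branch_type,
                                    bool exclude_user, bool exclude_kernel)
{
    uint32_t want = 0;

    if (!(branch_type & BR_ANY))
        return false;                           /* must capture all branches */

    if (!exclude_user)   want |= BR_USER;
    if (!exclude_kernel) want |= BR_KERNEL;

    /* the requested priv bits must match the event's priv bits exactly */
    return (branch_type & (BR_USER | BR_KERNEL)) == want;
}

int main(void)
{
    printf("%d\n", precise_br_compat_model(BR_ANY | BR_KERNEL, true, false));  /* 1 */
    printf("%d\n", precise_br_compat_model(BR_ANY | BR_USER, true, false));    /* 0 */
    return 0;
}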
2017-08-23 02:52:01 +08:00
|
|
|
int x86_pmu_max_precise(void)
|
2010-03-12 00:54:39 +08:00
|
|
|
{
|
2017-08-23 02:52:01 +08:00
|
|
|
int precise = 0;
|
|
|
|
|
|
|
|
/* Support for constant skid */
|
|
|
|
if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
|
|
|
|
precise++;
|
2010-04-09 05:03:20 +08:00
|
|
|
|
2017-08-23 02:52:01 +08:00
|
|
|
/* Support for IP fixup */
|
|
|
|
if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
|
2010-04-09 05:03:20 +08:00
|
|
|
precise++;
|
|
|
|
|
2017-08-23 02:52:01 +08:00
|
|
|
if (x86_pmu.pebs_prec_dist)
|
|
|
|
precise++;
|
|
|
|
}
|
|
|
|
return precise;
|
|
|
|
}
|
perf/x86: Use INST_RETIRED.PREC_DIST for cycles: ppp
Add a new 'three-p' precise level, that uses INST_RETIRED.PREC_DIST as
base. The basic mechanism of abusing the inverse cmask to get all
cycles works the same as before.
PREC_DIST is available on Sandy Bridge or later. It had some problems
on Sandy Bridge, so we only use it on IvyBridge and later. I tested it
on Broadwell and Skylake.
PREC_DIST has special support for avoiding shadow effects, which can
give better results compared to UOPS_RETIRED. The drawback is that
PREC_DIST can only schedule on counter 1, but that is ok for cycle
sampling, as there is normally no need to do multiple cycle sampling
runs in parallel. It is still possible to run perf top in parallel, as
that doesn't use precise mode. Also of course the multiplexing can
still allow parallel operation.
:pp stays with the previous event.
Example:
Sample a loop with 10 sqrt with old cycles:pp
0.14 │10: sqrtps %xmm1,%xmm0 <--------------
9.13 │ sqrtps %xmm1,%xmm0
11.58 │ sqrtps %xmm1,%xmm0
11.51 │ sqrtps %xmm1,%xmm0
6.27 │ sqrtps %xmm1,%xmm0
10.38 │ sqrtps %xmm1,%xmm0
12.20 │ sqrtps %xmm1,%xmm0
12.74 │ sqrtps %xmm1,%xmm0
5.40 │ sqrtps %xmm1,%xmm0
10.14 │ sqrtps %xmm1,%xmm0
10.51 │ ↑ jmp 10
We expect all 10 sqrt to get roughly the same number of samples.
But you can see that the instruction directly after the JMP is
systematically underestimated in the result, due to sampling shadow
effects.
With the new PREC_DIST based sampling this problem is gone and all
instructions show up roughly evenly:
9.51 │10: sqrtps %xmm1,%xmm0
11.74 │ sqrtps %xmm1,%xmm0
11.84 │ sqrtps %xmm1,%xmm0
6.05 │ sqrtps %xmm1,%xmm0
10.46 │ sqrtps %xmm1,%xmm0
12.25 │ sqrtps %xmm1,%xmm0
12.18 │ sqrtps %xmm1,%xmm0
5.26 │ sqrtps %xmm1,%xmm0
10.13 │ sqrtps %xmm1,%xmm0
10.43 │ sqrtps %xmm1,%xmm0
0.16 │ ↑ jmp 10
Even with PREC_DIST there is still sampling skid and the result is not
completely even, but systematic shadow effects are significantly
reduced.
The improvements are mainly expected to make a difference in high IPC
code. With low IPC it should be similar.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: hpa@zytor.com
Link: http://lkml.kernel.org/r/1448929689-13771-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-04 19:50:52 +08:00
|
|
|
|
2017-08-23 02:52:01 +08:00
|
|
|
int x86_pmu_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (event->attr.precise_ip) {
|
|
|
|
int precise = x86_pmu_max_precise();
|
2010-04-09 05:03:20 +08:00
|
|
|
|
|
|
|
if (event->attr.precise_ip > precise)
|
|
|
|
return -EOPNOTSUPP;
|
2017-01-03 22:24:54 +08:00
|
|
|
|
|
|
|
/* There's no sense in having PEBS for non sampling events: */
|
|
|
|
if (!is_sampling_event(event))
|
|
|
|
return -EINVAL;
|
2014-11-05 10:56:08 +08:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* check that PEBS LBR correction does not conflict with
|
|
|
|
* whatever the user is asking with attr->branch_sample_type
|
|
|
|
*/
|
|
|
|
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
|
|
|
|
u64 *br_type = &event->attr.branch_sample_type;
|
|
|
|
|
|
|
|
if (has_branch_stack(event)) {
|
|
|
|
if (!precise_br_compat(event))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
/* branch_sample_type is compatible */
|
|
|
|
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* user did not specify branch_sample_type
|
|
|
|
*
|
|
|
|
* For PEBS fixups, we capture all
|
|
|
|
* the branches at the priv level of the
|
|
|
|
* event.
|
|
|
|
*/
|
|
|
|
*br_type = PERF_SAMPLE_BRANCH_ANY;
|
|
|
|
|
|
|
|
if (!event->attr.exclude_user)
|
|
|
|
*br_type |= PERF_SAMPLE_BRANCH_USER;
|
|
|
|
|
|
|
|
if (!event->attr.exclude_kernel)
|
|
|
|
*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
|
2012-02-10 06:20:54 +08:00
|
|
|
}
|
2010-04-09 05:03:20 +08:00
|
|
|
}
|
|
|
|
|
2014-11-05 10:56:03 +08:00
|
|
|
if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
|
|
|
|
event->attach_state |= PERF_ATTACH_TASK_DATA;
|
|
|
|
|
2010-03-12 00:54:39 +08:00
|
|
|
/*
|
|
|
|
* Generate PMC IRQs:
|
|
|
|
* (keep 'enabled' bit clear for now)
|
|
|
|
*/
|
2010-03-30 23:00:06 +08:00
|
|
|
event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
|
2010-03-12 00:54:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Count user and OS events unless requested not to
|
|
|
|
*/
|
2010-03-30 23:00:06 +08:00
|
|
|
if (!event->attr.exclude_user)
|
|
|
|
event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
|
|
|
|
if (!event->attr.exclude_kernel)
|
|
|
|
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
|
2010-03-12 00:54:39 +08:00
|
|
|
|
2021-04-12 22:30:56 +08:00
|
|
|
if (event->attr.type == event->pmu->type)
|
2010-03-30 23:00:06 +08:00
|
|
|
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
|
2010-03-12 00:54:39 +08:00
|
|
|
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bits 0-5 of the period are !0, we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So there is no need to 'fix' the tools; all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
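The averaging argument in the changelog is easy to check numerically: truncating the value programmed into the counter while re-arming from the untruncated sample_period makes the observed intervals alternate around the requested period. A quick user-space simulation of that arithmetic (the re-arm logic is condensed from the description above, not copied from x86_perf_event_set_period()):

#include <stdio.h>

/* Broadwell-style quirk: clear the low 6 bits, but never go below 128. */
static long limit_period(long left)
{
    left &= ~0x3fL;
    if (left < 128)
        left = 128;
    return left;
}

int main(void)
{
    const long period = 192;            /* requested sample_period */
    long period_left = period;
    long n = 0, last_sample = 0;

    for (int samples = 0; samples < 6; ) {
        long programmed = period_left <= 0 ? period_left + period : period_left;

        if (period_left <= 0)
            period_left += period;
        programmed = limit_period(programmed);

        n += programmed;                /* hardware counts this many, then PMIs */
        period_left -= programmed;

        if (period_left <= 0) {         /* a real overflow: emit a sample */
            printf("sample at %ld (interval %ld)\n", n, n - last_sample);
            last_sample = n;
            samples++;
        }
    }
    return 0;
}

With period 192 this prints intervals alternating between 256 and 128, i.e. the "right ratio to average out at 192" described above.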
|
|
|
if (event->attr.sample_period && x86_pmu.limit_period) {
|
2022-05-11 03:28:25 +08:00
|
|
|
s64 left = event->attr.sample_period;
|
|
|
|
x86_pmu.limit_period(event, &left);
|
|
|
|
if (left > event->attr.sample_period)
|
2015-02-18 10:18:06 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
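The check above rejects an event whose requested period the quirk would have to raise (for example a Broadwell-style minimum of 128 applied to sample_period=100), since silently sampling more often than asked for would be wrong. A small illustration with a hypothetical limit_period callback of that shape (not a specific x86_pmu instance):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical quirk: multiples of 64 only, never below 128. */
static void toy_limit_period(int64_t *left)
{
    *left &= ~0x3fLL;
    if (*left < 128)
        *left = 128;
}

static int validate_period(int64_t sample_period)
{
    int64_t left = sample_period;

    toy_limit_period(&left);
    if (left > sample_period)
        return -1;          /* -EINVAL: quirk would sample more often than asked */
    return 0;
}

int main(void)
{
    printf("period 192: %s\n", validate_period(192) ? "rejected" : "ok");
    printf("period 100: %s\n", validate_period(100) ? "rejected" : "ok");
    return 0;
}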
2019-04-03 03:44:59 +08:00
|
|
|
/* sample_regs_user never support XMM registers */
|
2019-05-29 06:08:32 +08:00
|
|
|
if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
|
2019-04-03 03:44:59 +08:00
|
|
|
return -EINVAL;
|
|
|
|
/*
|
|
|
|
* Besides the general purpose registers, XMM registers may
|
|
|
|
* be collected in PEBS on some platforms, e.g. Icelake
|
|
|
|
*/
|
2019-05-29 06:08:32 +08:00
|
|
|
if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
|
2019-05-29 06:08:33 +08:00
|
|
|
if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
|
2019-04-03 03:44:59 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (!event->attr.precise_ip)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2010-04-14 04:23:12 +08:00
|
|
|
return x86_setup_perfctr(event);
|
2010-03-30 17:28:21 +08:00
|
|
|
}
|
|
|
|
|
2008-12-03 17:39:53 +08:00
|
|
|
/*
|
2009-06-03 01:22:16 +08:00
|
|
|
* Setup the hardware configuration for a given attr_type
|
2008-12-03 17:39:53 +08:00
|
|
|
*/
|
2010-06-11 19:35:08 +08:00
|
|
|
static int __x86_pmu_event_init(struct perf_event *event)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
2009-03-31 01:07:16 +08:00
|
|
|
int err;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-04-29 18:47:20 +08:00
|
|
|
if (!x86_pmu_initialized())
|
|
|
|
return -ENODEV;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2015-06-11 20:13:56 +08:00
|
|
|
err = x86_reserve_hardware();
|
2009-03-31 01:07:16 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2015-06-09 18:03:26 +08:00
|
|
|
atomic_inc(&active_events);
|
2009-09-21 18:02:48 +08:00
|
|
|
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}
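
/*
 * Disable every counter that is currently active on this CPU by clearing
 * the enable bit in its event-select MSR.  For an AMD "Large Increment
 * per Cycle" event the counters operate as a pair, so the neighbouring
 * Merge counter's control register is cleared as well (the
 * is_counter_pair() case below).
 */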
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
		if (is_counter_pair(hwc))
			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
	}
}
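
/*
 * Entry point for hypervisor code (e.g. KVM) to obtain the list of MSRs
 * that must be switched between host and guest values around VM entry
 * and exit; the actual list is provided by the vendor implementation
 * behind the x86_pmu_guest_get_msrs static call.
 */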
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
{
	return static_call(x86_pmu_guest_get_msrs)(nr, data);
}
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	static_call(x86_pmu_disable_all)();
}
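
/*
 * Counterpart of x86_pmu_disable_all(): re-enable every counter that is
 * still marked active on this CPU.  The 'added' argument is a hint about
 * how many events were newly scheduled in; x86_pmu_enable_all() itself
 * ignores it, though vendor enable_all() implementations may use it,
 * e.g. for erratum workarounds.
 */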
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}
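
/*
 * On a hybrid system (e.g. Alder Lake) several core PMUs are registered,
 * one per CPU type, so comparing event->pmu against the single global
 * 'pmu' is not enough; walk all registered hybrid PMUs instead.
 */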
static inline int is_x86_event(struct perf_event *event)
{
	int i;

	if (!is_hybrid())
		return event->pmu == &pmu;

	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
		if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
			return true;
	}

	return false;
}

struct pmu *x86_get_pmu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

	/*
	 * All CPUs of the hybrid type have been offline.
	 * The x86_get_pmu() should not be invoked.
	 */
	if (WARN_ON_ONCE(!cpuc->pmu))
		return &pmu;

	return cpuc->pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	u64	used;
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};
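
/*
 * Rough sketch of how a caller drives the iterator below (in the kernel
 * this is done by perf_assign_events(), defined later in this file;
 * shown here only as an assumed usage shape): constraints[] holds one
 * event_constraint per collected event, and the loop assigns one counter
 * per event, most constrained events first:
 *
 *	struct perf_sched sched;
 *
 *	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
 *	do {
 *		if (!perf_sched_find_counter(&sched))
 *			break;		// failed, no full assignment found
 *		if (assign)
 *			assign[sched.state.event] = sched.state.counter;
 *	} while (perf_sched_next_event(&sched));
 *
 * A counter choice can be undone at most SCHED_STATES_MAX times via the
 * save/restore helpers below, which keeps the O(n!) search bounded.
 */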

/*
 * Initialize iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}
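
/*
 * The two helpers below give the scheduler a limited form of
 * backtracking: a copy of the iterator state can be saved after a
 * counter choice, and if the schedule later dead-ends the state is
 * restored, the counter is released from the 'used' mask and the same
 * event retries with the next counter (the "try the next one" step in
 * perf_sched_restore_state()).  At most SCHED_STATES_MAX states are
 * kept, bounding the otherwise O(n!) search.
 */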
static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* this assignment didn't work out */
	/* XXX broken vs EVENT_PAIR */
	sched->state.used &= ~BIT_ULL(sched->state.counter);

	/* try the next one */
	sched->state.counter++;

	return true;
}
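
/*
 * Counter selection below makes two passes over the event's constraint
 * mask: fixed-purpose counters (bits at INTEL_PMC_IDX_FIXED and above)
 * are tried first, then the general-purpose counters starting from the
 * iterator's current position, skipping anything already present in the
 * state.used bitmask.
 */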

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			u64 mask = BIT_ULL(idx);

			if (sched->state.used & mask)
				continue;

			sched->state.used |= mask;
			goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
u64 mask = BIT_ULL(idx);
|
2015-05-21 16:57:17 +08:00
|
|
|
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
if (c->flags & PERF_X86_EVENT_PAIR)
|
|
|
|
mask |= mask << 1;
|
|
|
|
|
|
|
|
if (sched->state.used & mask)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (sched->state.nr_gp++ >= sched->max_gp)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
sched->state.used |= mask;
|
|
|
|
goto done;
|
2011-11-18 19:35:21 +08:00
|
|
|
}
|
|
|
|
|
2011-11-10 22:15:42 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
done:
|
|
|
|
sched->state.counter = idx;
|
2011-11-18 19:35:21 +08:00
|
|
|
|
2011-11-18 19:35:22 +08:00
|
|
|
if (c->overlap)
|
|
|
|
perf_sched_save_state(sched);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool perf_sched_find_counter(struct perf_sched *sched)
|
|
|
|
{
|
|
|
|
while (!__perf_sched_find_counter(sched)) {
|
|
|
|
if (!perf_sched_restore_state(sched))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-11-18 19:35:21 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Go through all unassigned events and find the next one to schedule.
|
|
|
|
* Take events with the least weight first. Return true on success.
|
|
|
|
*/
|
|
|
|
static bool perf_sched_next_event(struct perf_sched *sched)
|
|
|
|
{
|
|
|
|
struct event_constraint *c;
|
|
|
|
|
|
|
|
if (!sched->state.unassigned || !--sched->state.unassigned)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
do {
|
|
|
|
/* next event */
|
|
|
|
sched->state.event++;
|
|
|
|
if (sched->state.event >= sched->max_events) {
|
|
|
|
/* next weight */
|
|
|
|
sched->state.event = 0;
|
|
|
|
sched->state.weight++;
|
|
|
|
if (sched->state.weight > sched->max_weight)
|
|
|
|
return false;
|
|
|
|
}
|
2015-05-21 16:57:13 +08:00
|
|
|
c = sched->constraints[sched->state.event];
|
2011-11-18 19:35:21 +08:00
|
|
|
} while (c->weight != sched->state.weight);
|
|
|
|
|
|
|
|
sched->state.counter = 0; /* start with first counter */
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Assign a counter for each event.
|
|
|
|
*/
|
2015-05-21 16:57:13 +08:00
|
|
|
int perf_assign_events(struct event_constraint **constraints, int n,
|
2015-05-21 16:57:17 +08:00
|
|
|
int wmin, int wmax, int gpmax, int *assign)
|
2011-11-18 19:35:21 +08:00
|
|
|
{
|
|
|
|
struct perf_sched sched;
|
|
|
|
|
2015-05-21 16:57:17 +08:00
|
|
|
perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
|
2011-11-18 19:35:21 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
if (!perf_sched_find_counter(&sched))
|
|
|
|
break; /* failed */
|
|
|
|
if (assign)
|
|
|
|
assign[sched.state.event] = sched.state.counter;
|
|
|
|
} while (perf_sched_next_event(&sched));
|
|
|
|
|
|
|
|
return sched.state.unassigned;
|
|
|
|
}
|
2014-03-18 16:56:43 +08:00
|
|
|
EXPORT_SYMBOL_GPL(perf_assign_events);
|
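The constraint scheduler above assigns events in order of increasing weight (the number of counters an event may use). Below is a standalone userspace sketch of that least-weight-first policy, with invented struct and variable names and a hard-coded two-event example; it only illustrates the idea, not the kernel algorithm itself.

#include <stdio.h>

/* Toy event: a bitmask of usable counters plus its popcount (the weight). */
struct ev { unsigned long idxmsk; int weight; };

int main(void)
{
	struct ev evs[2] = {
		{ 0x1, 1 },	/* constrained: may only use counter 0 */
		{ 0xf, 4 },	/* flexible: may use any of counters 0..3 */
	};
	unsigned long used = 0;
	int assign[2] = { -1, -1 };

	/*
	 * Least weight first: the constrained event grabs counter 0 before
	 * the flexible one can, so both events end up with a counter.
	 */
	for (int w = 1; w <= 4; w++) {
		for (int i = 0; i < 2; i++) {
			if (evs[i].weight != w)
				continue;
			for (int c = 0; c < 4; c++) {
				if ((evs[i].idxmsk >> c & 1) && !(used >> c & 1)) {
					used |= 1UL << c;
					assign[i] = c;
					break;
				}
			}
		}
	}
	printf("event0 -> counter %d, event1 -> counter %d\n", assign[0], assign[1]);
	return 0;
}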
2011-11-18 19:35:21 +08:00
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
|
2010-01-18 16:58:01 +08:00
|
|
|
{
|
2021-04-12 22:30:46 +08:00
|
|
|
int num_counters = hybrid(cpuc->pmu, num_counters);
|
2013-05-24 02:07:03 +08:00
|
|
|
struct event_constraint *c;
|
2013-06-21 00:42:54 +08:00
|
|
|
struct perf_event *e;
|
2019-03-14 20:25:02 +08:00
|
|
|
int n0, i, wmin, wmax, unsched = 0;
|
2010-01-18 16:58:01 +08:00
|
|
|
struct hw_perf_event *hwc;
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
u64 used_mask = 0;
|
2010-01-18 16:58:01 +08:00
|
|
|
|
2019-03-14 20:25:02 +08:00
|
|
|
/*
|
|
|
|
* Compute the number of events already present; see x86_pmu_add(),
|
|
|
|
* validate_group() and x86_pmu_commit_txn(). For the former two
|
|
|
|
* cpuc->n_events hasn't been updated yet, while for the latter
|
|
|
|
* cpuc->n_txn contains the number of events added in the current
|
|
|
|
* transaction.
|
|
|
|
*/
|
|
|
|
n0 = cpuc->n_events;
|
|
|
|
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
|
|
|
|
n0 -= cpuc->n_txn;
|
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_cond(x86_pmu_start_scheduling)(cpuc);
|
2014-11-18 03:06:55 +08:00
|
|
|
|
2011-11-18 19:35:21 +08:00
|
|
|
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
|
2019-03-14 20:17:51 +08:00
|
|
|
c = cpuc->event_constraint[i];
|
|
|
|
|
2019-03-14 20:25:02 +08:00
|
|
|
/*
|
|
|
|
* Previously scheduled events should have a cached constraint,
|
|
|
|
* while new events should not have one.
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
|
|
|
|
|
2019-03-14 20:17:51 +08:00
|
|
|
/*
|
|
|
|
* Request constraints for new events; or for those events that
|
|
|
|
* have a dynamic constraint -- for those the constraint can
|
|
|
|
* change due to external factors (sibling state, allow_tfa).
|
|
|
|
*/
|
|
|
|
if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
|
2020-08-18 21:57:53 +08:00
|
|
|
c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
|
2019-03-14 20:17:51 +08:00
|
|
|
cpuc->event_constraint[i] = c;
|
|
|
|
}
|
2013-05-24 02:07:03 +08:00
|
|
|
|
2011-11-18 19:35:21 +08:00
|
|
|
wmin = min(wmin, c->weight);
|
|
|
|
wmax = max(wmax, c->weight);
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
|
2010-01-21 23:39:01 +08:00
|
|
|
/*
|
|
|
|
* fastpath, try to reuse previous register
|
|
|
|
*/
|
2010-01-22 23:40:12 +08:00
|
|
|
for (i = 0; i < n; i++) {
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
u64 mask;
|
|
|
|
|
2010-01-21 23:39:01 +08:00
|
|
|
hwc = &cpuc->event_list[i]->hw;
|
2015-05-21 16:57:13 +08:00
|
|
|
c = cpuc->event_constraint[i];
|
2010-01-21 23:39:01 +08:00
|
|
|
|
|
|
|
/* never assigned */
|
|
|
|
if (hwc->idx == -1)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* constraint still honored */
|
2010-01-22 23:32:17 +08:00
|
|
|
if (!test_bit(hwc->idx, c->idxmsk))
|
2010-01-21 23:39:01 +08:00
|
|
|
break;
|
|
|
|
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
mask = BIT_ULL(hwc->idx);
|
|
|
|
if (is_counter_pair(hwc))
|
|
|
|
mask |= mask << 1;
|
|
|
|
|
2010-01-21 23:39:01 +08:00
|
|
|
/* not already used */
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
if (used_mask & mask)
|
2010-01-21 23:39:01 +08:00
|
|
|
break;
|
|
|
|
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
used_mask |= mask;
|
|
|
|
|
2010-01-21 23:39:01 +08:00
|
|
|
if (assign)
|
|
|
|
assign[i] = hwc->idx;
|
|
|
|
}
|
|
|
|
|
2011-11-18 19:35:21 +08:00
|
|
|
/* slow path */
|
2015-05-21 16:57:13 +08:00
|
|
|
if (i != n) {
|
2021-04-12 22:30:46 +08:00
|
|
|
int gpmax = num_counters;
|
2015-05-21 16:57:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not allow scheduling of more than half the available
|
|
|
|
* generic counters.
|
|
|
|
*
|
|
|
|
* This helps avoid counter starvation of sibling thread by
|
|
|
|
* ensuring at most half the counters cannot be in exclusive
|
|
|
|
* mode. There are no designated counters for the limits. Any
|
|
|
|
* N/2 counters can be used. This helps with events with
|
|
|
|
* specific counter constraints.
|
|
|
|
*/
|
|
|
|
if (is_ht_workaround_enabled() && !cpuc->is_fake &&
|
|
|
|
READ_ONCE(cpuc->excl_cntrs->exclusive_present))
|
|
|
|
gpmax /= 2;
|
|
|
|
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
/*
|
|
|
|
* Reduce the number of available counters to allow fitting
|
|
|
|
* the extra Merge events needed by large increment events.
|
|
|
|
*/
|
|
|
|
if (x86_pmu.flags & PMU_FL_PAIR) {
|
2021-04-12 22:30:46 +08:00
|
|
|
gpmax = num_counters - cpuc->n_pair;
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
WARN_ON(gpmax <= 0);
|
|
|
|
}
|
|
|
|
|
2015-05-21 16:57:13 +08:00
|
|
|
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
|
2015-05-21 16:57:17 +08:00
|
|
|
wmax, gpmax, assign);
|
2015-05-21 16:57:13 +08:00
|
|
|
}
|
2010-01-21 23:39:01 +08:00
|
|
|
|
2013-06-21 00:42:54 +08:00
|
|
|
/*
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictable. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload on the two threads
is the same. Given there is no way to guarantee this, a workaround
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.
There is no HW or FW workaround for this problem.
The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming a large floating-point vector. We use the 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -e r20cc sleep 10
Performance counter stats for 'system wide':
139 277 291 r20cc
10,000969126 seconds time elapsed
In this example, 0x81d0 and 0x20cc are using sibling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 million occurrences.
This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.
The same example run with the workaround enabled yields the correct answer:
$ taskset -c 0 triad &
$ taskset -c 4 triad &
$ perf stat -a -C 0 -e r81d0 sleep 100 &
$ perf stat -a -C 4 -e r20cc sleep 10
Performance counter stats for 'system wide':
0 r20cc
10,000969126 seconds time elapsed
The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation
easier by guaranteeing the sibling counter is not measuring any useful event.
The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is running on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.
As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situations where there are fewer events than counters
measured on a CPU.
The patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
at little extra cost. Adding dynamic detection of whether HT is on turned out
to be too complex and to require too much code to be justified.
This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.
Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-18 03:06:58 +08:00
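As a rough illustration of the mutual-exclusion rule this workaround enforces, the sketch below models the XSU states with invented names; the kernel keeps the real state in cpuc->excl_cntrs (consulted earlier in this function), not in a helper like this.

/*
 * Hypothetical model of the XSU scheme: a sibling counter is either
 * Unused, Shared (non-corrupting event) or Exclusive (corrupting event).
 */
enum xsu_state { XSU_UNUSED, XSU_SHARED, XSU_EXCLUSIVE };

/*
 * May this thread use counter idx, given the sibling thread's per-counter
 * states and whether our own event is one of the corrupting memory events?
 */
static int counter_usable(const enum xsu_state *sibling, int idx, int corrupting)
{
	if (sibling[idx] == XSU_EXCLUSIVE)
		return 0;	/* sibling measures a corrupting event there */
	if (corrupting && sibling[idx] != XSU_UNUSED)
		return 0;	/* our event would leak into the sibling's counter */
	return 1;
}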
|
|
|
* In case of success (unsched = 0), mark events as committed,
|
|
|
|
* so we do not put_constraint() in case new events are added
|
|
|
|
* and fail to be scheduled
|
|
|
|
*
|
|
|
|
* We invoke the lower level commit callback to lock the resource
|
|
|
|
*
|
|
|
|
* We do not need to do all of this in case we are called to
|
|
|
|
* validate an event group (assign == NULL)
|
2013-06-21 00:42:54 +08:00
|
|
|
*/
|
perf/x86/intel: Implement cross-HT corruption bug workaround
2014-11-18 03:06:58 +08:00
|
|
|
if (!unsched && assign) {
|
2021-08-04 19:57:10 +08:00
|
|
|
for (i = 0; i < n; i++)
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
|
2015-05-21 16:57:43 +08:00
|
|
|
} else {
|
2019-03-14 20:25:02 +08:00
|
|
|
for (i = n0; i < n; i++) {
|
2013-06-21 00:42:54 +08:00
|
|
|
e = cpuc->event_list[i];
|
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
2014-11-18 03:06:58 +08:00
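As an aside, the cross-thread constraint described above can be pictured with a small sketch. This is illustrative only and is not the kernel's implementation; the enum, struct and helper names below are made up for the example. A counter held Exclusive by a corrupting event on one hyper-thread forces the same counter index on the sibling thread to stay Unused, while a non-corrupting event only needs the sibling slot not to be Exclusive.

/* Illustrative XSU (eXclusive/Shared/Unused) sketch; not kernel code. */
enum xsu_state { XSU_UNUSED, XSU_SHARED, XSU_EXCLUSIVE };

struct ht_excl_sketch {
        enum xsu_state state[16];       /* one slot per HW counter index */
};

/*
 * A corrupting event (0xd0-0xd3 on the affected parts) needs its counter
 * index to be completely unused on the sibling thread; a non-corrupting
 * event only needs the sibling slot not to be held Exclusive.
 */
static bool counter_usable(const struct ht_excl_sketch *sibling,
                           int idx, bool corrupting)
{
        if (corrupting)
                return sibling->state[idx] == XSU_UNUSED;

        return sibling->state[idx] != XSU_EXCLUSIVE;
}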
|
|
|
/*
|
|
|
|
* release events that failed scheduling
|
|
|
|
*/
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
|
2019-03-14 20:03:00 +08:00
|
|
|
|
|
|
|
cpuc->event_constraint[i] = NULL;
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
}
|
2014-11-18 03:06:55 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_cond(x86_pmu_stop_scheduling)(cpuc);
|
2014-11-18 03:06:55 +08:00
|
|
|
|
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:
- SandyBridge: BJ122
- IvyBridge: BV98
- Haswell: HSD29
2014-11-18 03:06:58 +08:00
|
|
|
return unsched ? -EINVAL : 0;
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf already supports the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of the accepted
metrics events. The sharing between multiple users of the same metric
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids unnecessary writes to the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
None of the four metric events supports sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots values can only be updated by the slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS-related register may overflow, in which case bit 48 of the STATUS
register will be set. If so, PERF_METRICS and Fixed Counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
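For orientation, here is a small sketch of how a level-1 metric value can be derived from the two registers described above. It assumes the documented PERF_METRICS layout (one byte per metric, expressed as a fraction of 0xff of the slots count); the helper name is illustrative, not the kernel's.

/* Sketch only: derive a level-1 TopDown metric count from PERF_METRICS
 * and the fixed counter 3 ("slots") value.  idx selects one of the four
 * level-1 metric bytes in the low 32 bits of PERF_METRICS. */
static u64 metric_from_registers(u64 perf_metrics, u64 slots, int idx)
{
        u64 frac = (perf_metrics >> (idx * 8)) & 0xff;  /* fraction of 0xff */

        return frac * slots / 0xff;                     /* scale back to slots */
}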
|
|
|
static int add_nr_metric_event(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (is_metric_event(event)) {
|
|
|
|
if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
|
|
|
|
return -EINVAL;
|
|
|
|
cpuc->n_metric++;
|
perf/x86: Fix n_metric for cancelled txn
When a group that has TopDown members fails to be scheduled, any
later TopDown groups will not return valid values.
Here is an example.
A background perf that occupies all the GP counters and the fixed
counter 1.
$perf stat -e "{cycles,cycles,cycles,cycles,cycles,cycles,cycles,
cycles,cycles}:D" -a
A user monitors a TopDown group. It works well, because the fixed
counter 3 and the PERF_METRICS are available.
$perf stat -x, --topdown -- ./workload
retiring,bad speculation,frontend bound,backend bound,
18.0,16.1,40.4,25.5,
Then the user tries to monitor a group that has TopDown members.
Because of the cycles event, the group fails to be scheduled.
$perf stat -x, -e '{slots,topdown-retiring,topdown-be-bound,
topdown-fe-bound,topdown-bad-spec,cycles}'
-- ./workload
<not counted>,,slots,0,0.00,,
<not counted>,,topdown-retiring,0,0.00,,
<not counted>,,topdown-be-bound,0,0.00,,
<not counted>,,topdown-fe-bound,0,0.00,,
<not counted>,,topdown-bad-spec,0,0.00,,
<not counted>,,cycles,0,0.00,,
The user tries to monitor a TopDown group again. It doesn't work anymore.
$perf stat -x, --topdown -- ./workload
,,,,,
In a txn, cancel_txn() truncates the event_list for a canceled
group and updates the number of events added in this transaction.
However, the number of TopDown events added in this transaction is not
updated. The kernel will probably fail to add new Topdown events.
Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics")
Reported-by: Andi Kleen <ak@linux.intel.com>
Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lkml.kernel.org/r/20201005082611.GH2628@hirez.programming.kicks-ass.net
2020-10-05 16:10:24 +08:00
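The shape of the fix is easy to sketch. The field names below follow struct cpu_hw_events and the n_txn_metric increment just below; the function name is only illustrative. The point is that a cancelled transaction must unwind the TopDown and pair counts together with the event counts:

/* Sketch: per-transaction bookkeeping that a cancelled txn must undo. */
static void sketch_cancel_txn(struct cpu_hw_events *cpuc)
{
        cpuc->n_added  -= cpuc->n_txn;          /* events added in this txn */
        cpuc->n_events -= cpuc->n_txn;
        cpuc->n_pair   -= cpuc->n_txn_pair;     /* counter pairs added in this txn */
        cpuc->n_metric -= cpuc->n_txn_metric;   /* the piece this fix adds */
}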
|
|
|
cpuc->n_txn_metric++;
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void del_nr_metric_event(struct cpu_hw_events *cpuc,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (is_metric_event(event))
|
|
|
|
cpuc->n_metric--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
|
|
|
|
int max_count, int n)
|
|
|
|
{
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
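Conceptually, the per-PMU capability lookup on the next line behaves like the sketch below: on a hybrid system the capabilities come from the x86_hybrid_pmu that the struct pmu is embedded in, otherwise from the global x86_pmu. The helper name is illustrative; the kernel expresses this through the hybrid() accessor.

/* Sketch of the hybrid capability lookup; not the kernel's macro. */
static union perf_capabilities sketch_intel_cap(struct pmu *pmu)
{
        if (is_hybrid() && pmu) {
                struct x86_hybrid_pmu *h =
                        container_of(pmu, struct x86_hybrid_pmu, pmu);

                return h->intel_cap;
        }

        return x86_pmu.intel_cap;
}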
|
|
|
union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
|
perf/x86/intel: Hybrid PMU support for perf capabilities
2021-04-12 22:30:44 +08:00
|
|
|
if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (n >= max_count + cpuc->n_metric)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
cpuc->event_list[n] = event;
|
2020-10-05 16:09:06 +08:00
|
|
|
if (is_counter_pair(&event->hw)) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
cpuc->n_pair++;
|
2020-10-05 16:09:06 +08:00
|
|
|
cpuc->n_txn_pair++;
|
|
|
|
}
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
/*
|
|
|
|
* dogrp: true if we must collect sibling events (group)
|
|
|
|
* returns total number of events and error code
|
|
|
|
*/
|
|
|
|
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
|
|
|
|
{
|
2021-04-12 22:30:46 +08:00
|
|
|
int num_counters = hybrid(cpuc->pmu, num_counters);
|
|
|
|
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
|
2010-01-18 16:58:01 +08:00
|
|
|
struct perf_event *event;
|
|
|
|
int n, max_count;
|
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
max_count = num_counters + num_counters_fixed;
|
2010-01-18 16:58:01 +08:00
|
|
|
|
|
|
|
/* current number of events already accepted */
|
|
|
|
n = cpuc->n_events;
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares the ability to output its data to the Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must be either writing
to PT or to the DS area; therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
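From user space, the mechanism described above is requested with the aux_output attribute bit on a PEBS-capable event grouped under an Intel PT leader. A rough sketch follows (error handling trimmed; the PT PMU type must be read from sysfs at runtime, and the wrapper name is made up):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_pebs_via_pt(int intel_pt_type)
{
        struct perf_event_attr pt = {
                .size = sizeof(pt),
                .type = intel_pt_type,  /* /sys/bus/event_source/devices/intel_pt/type */
        };
        struct perf_event_attr pebs = {
                .size       = sizeof(pebs),
                .type       = PERF_TYPE_HARDWARE,
                .config     = PERF_COUNT_HW_CPU_CYCLES,
                .precise_ip = 3,        /* ask for PEBS */
                .aux_output = 1,        /* emit PEBS records into the PT AUX stream */
        };
        int pt_fd, pebs_fd;

        pt_fd = syscall(__NR_perf_event_open, &pt, 0, -1, -1, 0);
        if (pt_fd < 0)
                return -1;

        /* group under the PT leader so both schedule in the same context */
        pebs_fd = syscall(__NR_perf_event_open, &pebs, 0, -1, pt_fd, 0);
        return pebs_fd;
}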
|
|
|
if (!cpuc->n_events)
|
|
|
|
cpuc->pebs_output = 0;
|
|
|
|
|
|
|
|
if (!cpuc->is_fake && leader->attr.precise_ip) {
|
|
|
|
/*
|
|
|
|
* For PEBS->PT, if !aux_event, the group leader (PT) went
|
|
|
|
* away, the group was broken down and this singleton event
|
|
|
|
* can't schedule any more.
|
|
|
|
*/
|
|
|
|
if (is_pebs_pt(leader) && !leader->aux_event)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pebs_output: 0: no PEBS so far, 1: PT, 2: DS
|
|
|
|
*/
|
|
|
|
if (cpuc->pebs_output &&
|
|
|
|
cpuc->pebs_output != is_pebs_pt(leader) + 1)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
cpuc->pebs_output = is_pebs_pt(leader) + 1;
|
|
|
|
}
|
2010-01-18 16:58:01 +08:00
|
|
|
|
|
|
|
if (is_x86_event(leader)) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
if (collect_event(cpuc, leader, max_count, n))
|
2011-11-10 00:56:37 +08:00
|
|
|
return -EINVAL;
|
2010-01-18 16:58:01 +08:00
|
|
|
n++;
|
|
|
|
}
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
if (!dogrp)
|
|
|
|
return n;
|
|
|
|
|
2018-03-16 00:36:56 +08:00
|
|
|
for_each_sibling_event(event, leader) {
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
|
2010-01-18 16:58:01 +08:00
|
|
|
continue;
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
if (collect_event(cpuc, event, max_count, n))
|
2011-11-10 00:56:37 +08:00
|
|
|
return -EINVAL;
|
2010-01-18 16:58:01 +08:00
|
|
|
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void x86_assign_hw_event(struct perf_event *event,
|
2010-02-01 20:50:01 +08:00
|
|
|
struct cpu_hw_events *cpuc, int i)
|
2010-01-18 16:58:01 +08:00
|
|
|
{
|
2010-02-01 20:50:01 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2020-06-13 16:09:47 +08:00
|
|
|
int idx;
|
2010-02-01 20:50:01 +08:00
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
idx = hwc->idx = cpuc->assign[i];
|
2010-02-01 20:50:01 +08:00
|
|
|
hwc->last_cpu = smp_processor_id();
|
|
|
|
hwc->last_tag = ++cpuc->tags[i];
|
2010-01-18 16:58:01 +08:00
|
|
|
|
2021-09-08 00:39:01 +08:00
|
|
|
static_call_cond(x86_pmu_assign)(event, idx);
|
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
switch (hwc->idx) {
|
|
|
|
case INTEL_PMC_IDX_FIXED_BTS:
|
2020-06-13 16:09:49 +08:00
|
|
|
case INTEL_PMC_IDX_FIXED_VLBR:
|
2010-01-18 16:58:01 +08:00
|
|
|
hwc->config_base = 0;
|
|
|
|
hwc->event_base = 0;
|
2020-06-13 16:09:47 +08:00
|
|
|
break;
|
|
|
|
|
perf/x86/intel: Generic support for hardware TopDown metrics
2020-07-24 01:11:11 +08:00
|
|
|
case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
|
|
|
|
/* All the metric events are mapped onto the fixed counter 3. */
|
|
|
|
idx = INTEL_PMC_IDX_FIXED_SLOTS;
|
2020-11-21 02:31:36 +08:00
|
|
|
fallthrough;
|
2020-06-13 16:09:47 +08:00
|
|
|
case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
|
2010-01-18 16:58:01 +08:00
|
|
|
hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
|
2020-06-13 16:09:47 +08:00
|
|
|
hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
|
|
|
|
(idx - INTEL_PMC_IDX_FIXED);
|
2020-07-24 01:11:12 +08:00
|
|
|
hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
|
|
|
|
INTEL_PMC_FIXED_RDPMC_BASE;
|
2020-06-13 16:09:47 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
2011-02-03 00:40:59 +08:00
|
|
|
hwc->config_base = x86_pmu_config_addr(hwc->idx);
|
|
|
|
hwc->event_base = x86_pmu_event_addr(hwc->idx);
|
2013-02-07 01:26:28 +08:00
|
|
|
hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
|
2020-06-13 16:09:47 +08:00
|
|
|
break;
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-20 01:29:07 +08:00
|
|
|
/**
|
|
|
|
* x86_perf_rdpmc_index - Return PMC counter used for event
|
|
|
|
* @event: the perf_event to which the PMC counter was assigned
|
|
|
|
*
|
|
|
|
* The counter assigned to this performance event may change if interrupts
|
|
|
|
* are enabled. This counter should thus never be used while interrupts are
|
|
|
|
* enabled. Before this function is used to obtain the assigned counter the
|
|
|
|
* event should be checked for validity using, for example,
|
|
|
|
* perf_event_read_local(), within the same interrupt disabled section in
|
|
|
|
* which this counter is planned to be used.
|
|
|
|
*
|
|
|
|
* Return: The index of the performance monitoring counter assigned to
|
|
|
|
* @perf_event.
|
|
|
|
*/
|
|
|
|
int x86_perf_rdpmc_index(struct perf_event *event)
|
|
|
|
{
|
|
|
|
lockdep_assert_irqs_disabled();
|
|
|
|
|
|
|
|
return event->hw.event_base_rdpmc;
|
|
|
|
}
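A usage sketch following the rules stated in the comment above: the event is validated and its counter index obtained inside a single interrupts-disabled section, and the index is consumed before interrupts are re-enabled. The wrapper and callback names are made up for the example.

/* Sketch: validate the event and use its RDPMC index with IRQs off. */
static int sketch_use_rdpmc_index(struct perf_event *event,
                                  void (*consume)(int idx))
{
        unsigned long flags;
        u64 value, enabled, running;
        int err;

        local_irq_save(flags);
        err = perf_event_read_local(event, &value, &enabled, &running);
        if (!err)
                consume(x86_perf_rdpmc_index(event));   /* index used while IRQs stay off */
        local_irq_restore(flags);

        return err;
}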
|
|
|
|
|
2010-02-01 20:50:01 +08:00
|
|
|
static inline int match_prev_assignment(struct hw_perf_event *hwc,
|
|
|
|
struct cpu_hw_events *cpuc,
|
|
|
|
int i)
|
|
|
|
{
|
|
|
|
return hwc->idx == cpuc->assign[i] &&
|
|
|
|
hwc->last_cpu == smp_processor_id() &&
|
|
|
|
hwc->last_tag == cpuc->tags[i];
|
|
|
|
}
|
|
|
|
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
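To make the "stopped but still scheduled" state concrete, here is a small illustrative sketch (not code from this file) of how a caller of the new interface can pause a counter without giving up its PMU slot:

/* Illustrative only: PERF_EF_UPDATE folds the count, PERF_EF_RELOAD resumes. */
static void pause_and_resume_sketch(struct pmu *pmu, struct perf_event *event)
{
	pmu->stop(event, PERF_EF_UPDATE);	/* stop counting, keep the counter assignment */
	/* ... run code that must not be charged to this event ... */
	pmu->start(event, PERF_EF_RELOAD);	/* reprogram the period and resume */
}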
|
|
|
static void x86_pmu_start(struct perf_event *event, int flags);
|
2010-01-25 22:58:43 +08:00
|
|
|
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
static void x86_pmu_enable(struct pmu *pmu)
|
2008-12-13 16:00:03 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and fewer registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-01-18 16:58:01 +08:00
|
|
|
struct perf_event *event;
|
|
|
|
struct hw_perf_event *hwc;
|
2010-03-26 21:08:44 +08:00
|
|
|
int i, added = cpuc->n_added;
|
2010-01-18 16:58:01 +08:00
|
|
|
|
2009-04-29 18:47:20 +08:00
|
|
|
if (!x86_pmu_initialized())
|
2008-12-15 01:36:30 +08:00
|
|
|
return;
|
2010-01-28 06:07:47 +08:00
|
|
|
|
|
|
|
if (cpuc->enabled)
|
|
|
|
return;
|
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
if (cpuc->n_added) {
|
2010-03-06 20:20:40 +08:00
|
|
|
int n_running = cpuc->n_events - cpuc->n_added;
|
2010-01-18 16:58:01 +08:00
|
|
|
/*
|
|
|
|
* apply assignment obtained either from
|
|
|
|
* hw_perf_group_sched_in() or x86_pmu_enable()
|
|
|
|
*
|
|
|
|
* step1: save events moving to new counters
|
|
|
|
*/
|
2010-03-06 20:20:40 +08:00
|
|
|
for (i = 0; i < n_running; i++) {
|
2010-01-18 16:58:01 +08:00
|
|
|
event = cpuc->event_list[i];
|
|
|
|
hwc = &event->hw;
|
|
|
|
|
2010-02-01 20:50:01 +08:00
|
|
|
/*
|
|
|
|
* we can avoid reprogramming the counter if:
|
|
|
|
* - assigned same counter as last time
|
|
|
|
* - running on same CPU as last time
|
|
|
|
* - no other event has used the counter since
|
|
|
|
*/
|
|
|
|
if (hwc->idx == -1 ||
|
|
|
|
match_prev_assignment(hwc, cpuc, i))
|
2010-01-18 16:58:01 +08:00
|
|
|
continue;
|
|
|
|
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
/*
|
|
|
|
* Ensure we don't accidentally enable a stopped
|
|
|
|
* counter simply because we rescheduled.
|
|
|
|
*/
|
|
|
|
if (hwc->state & PERF_HES_STOPPED)
|
|
|
|
hwc->state |= PERF_HES_ARCH;
|
|
|
|
|
|
|
|
x86_pmu_stop(event, PERF_EF_UPDATE);
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
|
2014-02-24 19:26:21 +08:00
|
|
|
/*
|
|
|
|
* step2: reprogram moved events into new counters
|
|
|
|
*/
|
2010-01-18 16:58:01 +08:00
|
|
|
for (i = 0; i < cpuc->n_events; i++) {
|
|
|
|
event = cpuc->event_list[i];
|
|
|
|
hwc = &event->hw;
|
|
|
|
|
2010-03-11 20:40:30 +08:00
|
|
|
if (!match_prev_assignment(hwc, cpuc, i))
|
2010-02-01 20:50:01 +08:00
|
|
|
x86_assign_hw_event(event, cpuc, i);
|
2010-03-11 20:40:30 +08:00
|
|
|
else if (i < n_running)
|
|
|
|
continue;
|
2010-01-18 16:58:01 +08:00
|
|
|
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
if (hwc->state & PERF_HES_ARCH)
|
|
|
|
continue;
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period must be greater than 16 (BRS depth)
- the sampling period must use fixed mode, not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates, more branches may be executed, skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-23 06:15:07 +08:00
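The skid attenuation mentioned above ("reducing the sampling period by 16") amounts to the adjustment sketched below; the helper is illustrative and is not the code added by the later patch.

/* Illustrative: shorten the programmed period by the BRS depth (16). */
static u64 brs_adjust_period_sketch(u64 sample_period)
{
	return (sample_period > 16) ? sample_period - 16 : sample_period;
}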
|
|
|
/*
|
|
|
|
* if cpuc->enabled == 0, then no wrmsr as
|
|
|
|
* per x86_pmu_enable_event()
|
|
|
|
*/
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
x86_pmu_start(event, PERF_EF_RELOAD);
|
2010-01-18 16:58:01 +08:00
|
|
|
}
|
|
|
|
cpuc->n_added = 0;
|
|
|
|
perf_events_lapic_init();
|
|
|
|
}
|
2010-01-28 06:07:47 +08:00
|
|
|
|
|
|
|
cpuc->enabled = 1;
|
|
|
|
barrier();
|
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call(x86_pmu_enable_all)(added);
|
2008-12-13 16:00:03 +08:00
|
|
|
}
|
|
|
|
|
2022-05-20 21:38:43 +08:00
|
|
|
DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2008-12-13 16:00:03 +08:00
|
|
|
/*
|
|
|
|
* Set the next IRQ period, based on the hwc->period_left value.
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
* To be called with the event disabled in hw:
|
2008-12-13 16:00:03 +08:00
|
|
|
*/
|
2011-08-31 07:41:05 +08:00
|
|
|
int x86_perf_event_set_period(struct perf_event *event)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
2010-03-03 03:16:01 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2010-05-21 20:43:08 +08:00
|
|
|
s64 left = local64_read(&hwc->period_left);
|
2009-06-02 22:08:20 +08:00
|
|
|
s64 period = hwc->sample_period;
|
2010-03-08 20:51:31 +08:00
|
|
|
int ret = 0, idx = hwc->idx;
|
2008-12-13 16:00:03 +08:00
|
|
|
|
2020-06-13 16:09:47 +08:00
|
|
|
if (unlikely(!hwc->event_base))
|
2009-07-21 21:56:48 +08:00
|
|
|
return 0;
|
|
|
|
|
2008-12-13 16:00:03 +08:00
|
|
|
/*
|
tree-wide: fix assorted typos all over the place
That is "success", "unknown", "through", "performance", "[re|un]mapping"
, "access", "default", "reasonable", "[con]currently", "temperature"
, "channel", "[un]used", "application", "example","hierarchy", "therefore"
, "[over|under]flow", "contiguous", "threshold", "enough" and others.
Signed-off-by: André Goddard Rosa <andre.goddard@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2009-11-14 23:09:05 +08:00
|
|
|
* If we are way outside a reasonable range then just skip forward:
|
2008-12-13 16:00:03 +08:00
|
|
|
*/
|
|
|
|
if (unlikely(left <= -period)) {
|
|
|
|
left = period;
|
2010-05-21 20:43:08 +08:00
|
|
|
local64_set(&hwc->period_left, left);
|
2009-06-11 03:34:59 +08:00
|
|
|
hwc->last_period = period;
|
2009-06-02 22:08:20 +08:00
|
|
|
ret = 1;
|
2008-12-13 16:00:03 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(left <= 0)) {
|
|
|
|
left += period;
|
2010-05-21 20:43:08 +08:00
|
|
|
local64_set(&hwc->period_left, left);
|
2009-06-11 03:34:59 +08:00
|
|
|
hwc->last_period = period;
|
2009-06-02 22:08:20 +08:00
|
|
|
ret = 1;
|
2008-12-13 16:00:03 +08:00
|
|
|
}
|
2009-05-15 14:25:22 +08:00
|
|
|
/*
|
2009-09-21 17:31:35 +08:00
|
|
|
* Quirk: certain CPUs don't like it if just 1 hw_event is left:
|
2009-05-15 14:25:22 +08:00
|
|
|
*/
|
|
|
|
if (unlikely(left < 2))
|
|
|
|
left = 2;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-06-02 22:08:20 +08:00
|
|
|
if (left > x86_pmu.max_period)
|
|
|
|
left = x86_pmu.max_period;
|
|
|
|
|
2022-05-11 03:28:18 +08:00
|
|
|
static_call_cond(x86_pmu_limit_period)(event, &left);
|
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bits 0-5 of the period are != 0, we may get redundant PEBS
records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools; all we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-18 10:18:06 +08:00
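A sketch of the enforcement described above, in roughly the shape of the limit_period callback invoked through static_call_cond(x86_pmu_limit_period) below; the event check is illustrative and the real Broadwell callback lives in the Intel-specific code.

static void bdw_limit_period_sketch(struct perf_event *event, s64 *left)
{
	/* illustrative check: only INST_RETIRED.ALL (event 0xc0) needs the quirk */
	if ((event->hw.config & 0xff) != 0xc0)
		return;

	if (*left < 128)		/* BDM11: never program a period < 128 */
		*left = 128;
	*left &= ~0x3fULL;		/* BDM55: clear the low 6 bits */
}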
|
|
|
|
2022-05-11 03:28:25 +08:00
|
|
|
this_cpu_write(pmc_prev_left[idx], left);
|
2008-12-13 16:00:03 +08:00
|
|
|
|
perf/x86/intel: Fix event update for auto-reload
There is a bug when reading event->count with large PEBS enabled.
Here is an example:
# ./read_count
0x71f0
0x122c0
0x1000000001c54
0x100000001257d
0x200000000bdc5
In fixed period mode, the auto-reload mechanism could be enabled for
PEBS events, but the calculation of event->count does not take the
auto-reload values into account.
Anyone who reads event->count will get the wrong result, e.g. x86_pmu_read().
This bug was introduced with the auto-reload mechanism enabled since
commit:
851559e35fd5 ("perf/x86/intel: Use the PEBS auto reload mechanism when possible")
Introduce intel_pmu_save_and_restart_reload() to calculate the
event->count only for auto-reload.
Since the counter increments a negative counter value and overflows on
the sign switch, giving the interval:
[-period, 0]
the difference between two consecutive reads is:
A) value2 - value1;
when no overflows have happened in between,
B) (0 - value1) + (value2 - (-period));
when one overflow happened in between,
C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
when @n overflows happened in between.
Here A) is the obvious difference, B) is the extension to the discrete
interval, where the first term is to the top of the interval and the
second term is from the bottom of the next interval and C) the extension
to multiple intervals, where the middle term is the whole intervals
covered.
The equation for all cases is:
value2 - value1 + n * period
Previously the event->count is updated right before the sample output.
But for case A, there is no PEBS record ready. It needs to be specially
handled.
Remove the auto-reload code from x86_perf_event_set_period() since
we'll no longer call that function in this case.
Based-on-code-from: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Fixes: 851559e35fd5 ("perf/x86/intel: Use the PEBS auto reload mechanism when possible")
Link: http://lkml.kernel.org/r/1518474035-21006-2-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2018-02-13 06:20:31 +08:00
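The changelog's equation, value2 - value1 + n * period, written out as a tiny helper purely for clarity; this illustrates the arithmetic, it is not the code the patch adds.

/*
 * With auto-reload the counter runs in [-period, 0] and reloads on
 * overflow, so n overflows in between contribute n full periods.
 */
static u64 autoreload_delta_sketch(s64 value1, s64 value2, int n, u64 period)
{
	return (u64)(value2 - value1) + (u64)n * period;
}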
|
|
|
/*
|
|
|
|
* The hw event starts counting from this event offset,
|
|
|
|
* mark it to be able to extract future deltas:
|
|
|
|
*/
|
|
|
|
local64_set(&hwc->prev_count, (u64)-left);
|
2008-12-13 16:00:03 +08:00
|
|
|
|
perf/x86/intel: Fix event update for auto-reload
2018-02-13 06:20:31 +08:00
|
|
|
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
|
2010-06-03 05:23:04 +08:00
|
|
|
|
perf/x86/amd: Add support for Large Increment per Cycle Events
Description of hardware operation
---------------------------------
The core AMD PMU has a 4-bit wide per-cycle increment for each
performance monitor counter. That works for most events, but
now with AMD Family 17h and above processors, some events can
occur more than 15 times in a cycle. Those events are called
"Large Increment per Cycle" events. In order to count these
events, two adjacent h/w PMCs get their count signals merged
to form 8 bits per cycle total. In addition, the PERF_CTR count
registers are merged to be able to count up to 64 bits.
Normally, events like instructions retired, get programmed on a single
counter like so:
PERF_CTL0 (MSR 0xc0010200) 0x000000000053ff0c # event 0x0c, umask 0xff
PERF_CTR0 (MSR 0xc0010201) 0x0000800000000001 # r/w 48-bit count
The next counter at MSRs 0xc0010202-3 remains unused, or can be used
independently to count something else.
When counting Large Increment per Cycle events, such as FLOPs,
however, we now have to reserve the next counter and program the
PERF_CTL (config) register with the Merge event (0xFFF), like so:
PERF_CTL0 (msr 0xc0010200) 0x000000000053ff03 # FLOPs event, umask 0xff
PERF_CTR0 (msr 0xc0010201) 0x0000800000000001 # rd 64-bit cnt, wr lo 48b
PERF_CTL1 (msr 0xc0010202) 0x0000000f004000ff # Merge event, enable bit
PERF_CTR1 (msr 0xc0010203) 0x0000000000000000 # wr hi 16-bits count
The count is widened from the normal 48-bits to 64 bits by having the
second counter carry the higher 16 bits of the count in its lower 16
bits of its counter register.
The odd counter, e.g., PERF_CTL1, is programmed with the enabled Merge
event before the even counter, PERF_CTL0.
The Large Increment feature is available starting with Family 17h.
For more details, search any Family 17h PPR for the "Large Increment
per Cycle Events" section, e.g., section 2.1.15.3 on p. 173 in this
version:
https://www.amd.com/system/files/TechDocs/56176_ppr_Family_17h_Model_71h_B0_pub_Rev_3.06.zip
Description of software operation
---------------------------------
The following steps are taken in order to support reserving and
enabling the extra counter for Large Increment per Cycle events:
1. In the main x86 scheduler, we reduce the number of available
counters by the number of Large Increment per Cycle events being
scheduled, tracked by a new cpuc variable 'n_pair' and a new
amd_put_event_constraints_f17h(). This improves the counter
scheduler success rate.
2. In perf_assign_events(), if a counter is assigned to a Large
Increment event, we increment the current counter variable, so the
counter used for the Merge event is removed from assignment
consideration by upcoming event assignments.
3. In find_counter(), if a counter has been found for the Large
Increment event, we set the next counter as used, to prevent other
events from using it.
4. We perform steps 2 & 3 also in the x86 scheduler fastpath, i.e.,
we add Merge event accounting to the existing used_mask logic.
5. Finally, we add on the programming of Merge event to the
neighbouring PMC counters in the counter enable/disable{_all}
code paths.
Currently, software does not support a single PMU with mixed 48- and
64-bit counting, so Large increment event counts are limited to 48
bits. In set_period, we zero-out the upper 16 bits of the count, so
the hardware doesn't copy them to the even counter's higher bits.
Simple invocation example showing counting 8 FLOPs per 256-bit/%ymm
vaddps instruction executed in a loop 100 million times:
perf stat -e cpu/fp_ret_sse_avx_ops.all/,cpu/instructions/ <workload>
Performance counter stats for '<workload>':
800,000,000 cpu/fp_ret_sse_avx_ops.all/u
300,042,101 cpu/instructions/u
Prior to this patch, the reported SSE/AVX FLOPs retired count would
be wrong.
[peterz: lots of renames and edits to the code]
Signed-off-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
2019-11-15 02:37:20 +08:00
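The programming order called out above (odd counter first, carrying the Merge event, then the even counter with the real event) can be pictured with the MSR values quoted in the changelog; the helper is illustrative only.

/* Illustrative ordering sketch using the changelog's example values. */
static void enable_paired_flops_sketch(void)
{
	wrmsrl(0xc0010202, 0x0000000f004000ffULL);	/* PERF_CTL1: Merge event, enable bit */
	wrmsrl(0xc0010200, 0x000000000053ff03ULL);	/* PERF_CTL0: FLOPs event, umask 0xff */
}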
|
|
|
/*
|
2020-09-09 05:47:35 +08:00
|
|
|
* Sign extend the Merge event counter's upper 16 bits since
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
* we currently declare a 48-bit counter width
|
|
|
|
*/
|
|
|
|
if (is_counter_pair(hwc))
|
2020-09-09 05:47:35 +08:00
|
|
|
wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
|
perf/x86/amd: Add support for Large Increment per Cycle Events
2019-11-15 02:37:20 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
perf_event_update_userpage(event);
|
2009-06-22 22:35:24 +08:00
|
|
|
|
2009-06-02 22:08:20 +08:00
|
|
|
return ret;
|
2008-12-22 18:10:42 +08:00
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
void x86_pmu_enable_event(struct perf_event *event)
|
2009-04-29 18:47:18 +08:00
|
|
|
{
|
2010-12-18 23:28:55 +08:00
|
|
|
if (__this_cpu_read(cpu_hw_events.enabled))
|
2010-04-14 04:23:14 +08:00
|
|
|
__x86_pmu_enable_event(&event->hw,
|
|
|
|
ARCH_PERFMON_EVENTSEL_ENABLE);
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
|
|
|
|
2009-10-06 22:42:09 +08:00
|
|
|
/*
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
* Add a single event to the PMU.
|
2010-01-18 16:58:01 +08:00
|
|
|
*
|
|
|
|
* The event is added to the group of enabled events
|
2019-08-16 16:43:21 +08:00
|
|
|
* but only if it can be scheduled with existing events.
|
2009-10-08 17:56:07 +08:00
|
|
|
*/
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
static int x86_pmu_add(struct perf_event *event, int flags)
|
2009-10-08 17:56:07 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-01-18 16:58:01 +08:00
|
|
|
struct hw_perf_event *hwc;
|
|
|
|
int assign[X86_PMC_IDX_MAX];
|
|
|
|
int n, n0, ret;
|
2009-10-08 17:56:07 +08:00
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
hwc = &event->hw;
|
2009-10-08 17:56:07 +08:00
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
n0 = cpuc->n_events;
|
2010-06-11 23:32:03 +08:00
|
|
|
ret = n = collect_events(cpuc, event, false);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2009-05-26 03:41:28 +08:00
|
|
|
|
perf: Rework the PMU methods
2010-06-16 20:37:10 +08:00
|
|
|
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
|
|
|
|
if (!(flags & PERF_EF_START))
|
|
|
|
hwc->state |= PERF_HES_ARCH;
|
|
|
|
|
2010-04-23 13:56:12 +08:00
|
|
|
/*
|
|
|
|
* If group events scheduling transaction was started,
|
2011-03-18 03:24:16 +08:00
|
|
|
* skip the schedulability test here, it will be performed
|
2014-02-24 19:26:21 +08:00
|
|
|
* at commit time (->commit_txn) as a whole.
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
*
|
|
|
|
* If commit fails, we'll call ->del() on all events
|
|
|
|
* for which ->add() was called.
|
2010-04-23 13:56:12 +08:00
|
|
|
*/
|
2015-09-04 11:07:53 +08:00
|
|
|
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
|
2010-06-11 23:32:03 +08:00
|
|
|
goto done_collect;
|
2010-04-23 13:56:12 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
|
2010-01-18 16:58:01 +08:00
|
|
|
if (ret)
|
2010-06-11 23:32:03 +08:00
|
|
|
goto out;
|
2010-01-18 16:58:01 +08:00
|
|
|
/*
|
|
|
|
* copy the new assignment now that we know it is possible;
|
|
|
|
* it will be used by hw_perf_enable()
|
|
|
|
*/
|
|
|
|
memcpy(cpuc->assign, assign, n*sizeof(int));
|
2008-12-09 18:40:46 +08:00
|
|
|
|
2010-06-11 23:32:03 +08:00
|
|
|
done_collect:
|
2014-02-24 19:26:21 +08:00
|
|
|
/*
|
|
|
|
* Commit the collect_events() state. See x86_pmu_del() and
|
|
|
|
* x86_pmu_*_txn().
|
|
|
|
*/
|
2010-01-18 16:58:01 +08:00
|
|
|
cpuc->n_events = n;
|
2010-03-06 20:49:56 +08:00
|
|
|
cpuc->n_added += n - n0;
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPUs because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of a failed transaction, n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
cpuc->n_txn += n - n0;
|
2008-12-21 20:50:42 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
/*
|
|
|
|
* This is before x86_pmu_enable() will call x86_pmu_start(),
|
|
|
|
* so we enable LBRs before an event needs them etc..
|
|
|
|
*/
|
|
|
|
static_call_cond(x86_pmu_add)(event);
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
|
2010-06-11 23:32:03 +08:00
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
|
|
|
|
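The comment block above spells out the add-side contract: inside a PERF_PMU_TXN_ADD transaction, ->add() only collects events, the schedulability test runs once at ->commit_txn(), and a failed commit means ->del() is called for everything that was ->add()ed. As a hedged sketch of how a caller is expected to drive that protocol (sketch_group_add() is hypothetical, error handling is simplified, and it only assumes the generic struct pmu callbacks from <linux/perf_event.h>; it is not the in-tree group_sched_in()):

#include <linux/perf_event.h>

/* Hypothetical caller: drives the ->add()/->commit_txn() protocol. */
static int sketch_group_add(struct pmu *pmu, struct perf_event **events, int n)
{
	int i, j;

	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);		/* sets cpuc->txn_flags */

	for (i = 0; i < n; i++) {
		/* collect only; the schedulability test is deferred */
		if (pmu->add(events[i], PERF_EF_START))
			goto undo;
	}

	if (!pmu->commit_txn(pmu))			/* one schedule_events() pass */
		return 0;
undo:
	for (j = 0; j < i; j++)				/* ->del() whatever was ->add()ed */
		pmu->del(events[j], 0);
	pmu->cancel_txn(pmu);
	return -EAGAIN;
}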
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
static void x86_pmu_start(struct perf_event *event, int flags)
|
2010-02-08 23:06:01 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and fewer registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-03-06 20:19:24 +08:00
|
|
|
int idx = event->hw.idx;
|
|
|
|
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (WARN_ON_ONCE(idx == -1))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (flags & PERF_EF_RELOAD) {
|
|
|
|
WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
|
2022-05-11 03:27:22 +08:00
|
|
|
static_call(x86_pmu_set_period)(event);
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
event->hw.state = 0;
|
2010-02-08 23:06:01 +08:00
|
|
|
|
2010-03-06 20:19:24 +08:00
|
|
|
cpuc->events[idx] = event;
|
|
|
|
__set_bit(idx, cpuc->active_mask);
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call(x86_pmu_enable)(event);
|
2010-03-06 20:19:24 +08:00
|
|
|
perf_event_update_userpage(event);
|
2009-05-25 23:39:05 +08:00
|
|
|
}
|
|
|
|
|
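x86_pmu_start() above only accepts an event whose hardware state is PERF_HES_STOPPED, and PERF_EF_RELOAD additionally requires PERF_HES_UPTODATE. A hedged illustration of the stop/start pairing a caller is expected to use (sketch_restart_event() is hypothetical; only the generic pmu callbacks and flags from <linux/perf_event.h> are assumed):

#include <linux/perf_event.h>

/*
 * Hypothetical helper: PERF_EF_UPDATE drains the hardware count
 * (state becomes STOPPED|UPTODATE); PERF_EF_RELOAD then reprograms the
 * period from that up-to-date value, which is why x86_pmu_start()
 * WARNs when UPTODATE is missing.
 */
static void sketch_restart_event(struct perf_event *event)
{
	event->pmu->stop(event, PERF_EF_UPDATE);
	/* ... adjust the sample period, rotate the event, etc. ... */
	event->pmu->start(event, PERF_EF_RELOAD);
}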
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
void perf_event_print_debug(void)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
2008-12-22 18:10:42 +08:00
|
|
|
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
|
2015-02-28 01:48:31 +08:00
|
|
|
u64 pebs, debugctl;
|
2021-04-12 22:30:46 +08:00
|
|
|
int cpu = smp_processor_id();
|
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
int num_counters = hybrid(cpuc->pmu, num_counters);
|
|
|
|
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
|
2021-04-12 22:30:49 +08:00
|
|
|
struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
|
2009-05-13 14:12:51 +08:00
|
|
|
unsigned long flags;
|
2021-04-12 22:30:46 +08:00
|
|
|
int idx;
|
2008-12-09 19:18:18 +08:00
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
if (!num_counters)
|
2008-12-09 19:18:18 +08:00
|
|
|
return;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-05-13 14:12:51 +08:00
|
|
|
local_irq_save(flags);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-04-29 18:47:13 +08:00
|
|
|
if (x86_pmu.version >= 2) {
|
2009-02-28 21:15:39 +08:00
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
|
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
|
|
|
|
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
|
|
|
|
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
|
|
|
|
|
|
|
|
pr_info("\n");
|
|
|
|
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
|
|
|
|
pr_info("CPU#%d: status: %016llx\n", cpu, status);
|
|
|
|
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
|
|
|
|
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
|
2021-04-12 22:30:49 +08:00
|
|
|
if (pebs_constraints) {
|
2015-02-28 01:48:32 +08:00
|
|
|
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
|
|
|
|
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
|
|
|
|
}
|
2015-02-28 01:48:31 +08:00
|
|
|
if (x86_pmu.lbr_nr) {
|
|
|
|
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
|
|
|
|
pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
|
|
|
|
}
|
x86: AMD Support for perf_counter
Supported basic performance counter for AMD K7 and later:
$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null
Performance counter stats for 'ls':
12.298610 task clock ticks (msecs)
3298477 CPU cycles (events)
1406354 instructions (events)
749035 cache references (events)
16939 cache misses (events)
100589 branches (events)
11159 branch misses (events)
7.627540 cpu clock ticks (msecs)
12.298610 task clock ticks (msecs)
500 pagefaults (events)
6 context switches (events)
3 CPU migrations (events)
Wall-clock time elapsed: 8.672290 msecs
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-02-27 22:45:14 +08:00
|
|
|
}
|
2010-03-08 20:51:31 +08:00
|
|
|
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
for (idx = 0; idx < num_counters; idx++) {
|
2011-02-03 00:40:57 +08:00
|
|
|
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
|
|
|
|
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-06-24 14:13:48 +08:00
|
|
|
prev_left = per_cpu(pmc_prev_left[idx], cpu);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2009-02-28 21:15:39 +08:00
|
|
|
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
|
2008-12-03 17:39:53 +08:00
|
|
|
cpu, idx, pmc_ctrl);
|
2009-02-28 21:15:39 +08:00
|
|
|
pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
|
2008-12-03 17:39:53 +08:00
|
|
|
cpu, idx, pmc_count);
|
2009-02-28 21:15:39 +08:00
|
|
|
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
|
2008-12-13 16:00:03 +08:00
|
|
|
cpu, idx, prev_left);
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
2021-04-12 22:30:46 +08:00
|
|
|
for (idx = 0; idx < num_counters_fixed; idx++) {
|
2021-04-12 22:30:45 +08:00
|
|
|
if (fixed_counter_disabled(idx, cpuc->pmu))
|
2021-01-29 06:40:11 +08:00
|
|
|
continue;
|
2008-12-22 18:10:42 +08:00
|
|
|
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
|
|
|
|
|
2009-02-28 21:15:39 +08:00
|
|
|
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
|
2008-12-22 18:10:42 +08:00
|
|
|
cpu, idx, pmc_count);
|
|
|
|
}
|
2009-05-13 14:12:51 +08:00
|
|
|
local_irq_restore(flags);
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
|
|
|
|
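perf_event_print_debug() above dumps MSR_ARCH_PERFMON_FIXED_CTR_CTRL as one raw 64-bit value. As a small hedged sketch (fixed_ctrl_field() is hypothetical, not in-tree), each fixed counter owns one 4-bit field of that register, which can be extracted like this:

#include <linux/types.h>

/*
 * Hypothetical decode helper for the raw "fixed" value printed above:
 * bit 0: count in ring 0, bit 1: count in ring 3,
 * bit 2: any-thread,      bit 3: PMI on overflow.
 */
static inline unsigned int fixed_ctrl_field(u64 fixed, int idx)
{
	return (unsigned int)(fixed >> (idx * 4)) & 0xf;
}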
2011-08-31 07:41:05 +08:00
|
|
|
void x86_pmu_stop(struct perf_event *event, int flags)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and fewer registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2019-04-02 23:21:18 +08:00
|
|
|
if (test_bit(hwc->idx, cpuc->active_mask)) {
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call(x86_pmu_disable)(event);
|
2019-04-02 23:21:18 +08:00
|
|
|
__clear_bit(hwc->idx, cpuc->active_mask);
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
cpuc->events[hwc->idx] = NULL;
|
|
|
|
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
|
|
|
|
hwc->state |= PERF_HES_STOPPED;
|
|
|
|
}
|
2009-07-21 21:56:48 +08:00
|
|
|
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
|
|
|
|
/*
|
|
|
|
* Drain the remaining delta count out of an event
|
|
|
|
* that we are disabling:
|
|
|
|
*/
|
2022-05-11 03:27:22 +08:00
|
|
|
static_call(x86_pmu_update)(event);
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
hwc->state |= PERF_HES_UPTODATE;
|
|
|
|
}
|
2010-01-25 22:58:43 +08:00
|
|
|
}
|
|
|
|
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
static void x86_pmu_del(struct perf_event *event, int flags)
|
2010-01-25 22:58:43 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and fewer registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
|
|
|
union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
|
2010-01-25 22:58:43 +08:00
|
|
|
int i;
|
|
|
|
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of failed transaction n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
/*
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
* If we're called during a txn, we only need to undo x86_pmu.add.
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of failed transaction n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
* The events never got scheduled and ->cancel_txn will truncate
|
|
|
|
* the event_list.
|
2014-02-24 19:26:21 +08:00
|
|
|
*
|
|
|
|
* XXX assumes any ->del() called during a TXN will only be on
|
|
|
|
* an event added during that same TXN.
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of failed transaction n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
*/
|
2015-09-04 11:07:53 +08:00
|
|
|
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
|
|
|
goto do_del;
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of failed transaction n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
|
perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
The counter value of a perf task may leak to another RDPMC task.
For example, a perf stat task as below is running on CPU 0.
perf stat -e 'branches,cycles' -- taskset -c 0 ./workload
In the meantime, an RDPMC task, which is also running on CPU 0, may read
the GP counters periodically. (The RDPMC task creates a fixed event,
but reads four GP counters.)
$./rdpmc_read_all_counters
index 0x0 value 0x8001e5970f99
index 0x1 value 0x8005d750edb6
index 0x2 value 0x0
index 0x3 value 0x0
index 0x0 value 0x8002358e48a5
index 0x1 value 0x8006bd1e3bc9
index 0x2 value 0x0
index 0x3 value 0x0
It is a potential security issue: once the attacker knows what the other
thread is counting, the PerfMon counter can be used as a side-channel to
attack cryptosystems.
The counter value of the perf stat task leaks to the RDPMC task because
perf never clears the counter when it's stopped.
Three methods were considered to address the issue.
- Unconditionally reset the counter in x86_pmu_del(). It can bring extra
overhead even when there is no RDPMC task running.
- Only reset the un-assigned dirty counters when the RDPMC task is
scheduled in via sched_task(). It fails for the below case.
Thread A Thread B
clone(CLONE_THREAD) --->
set_affine(0)
set_affine(1)
while (!event-enabled)
;
event = perf_event_open()
mmap(event)
ioctl(event, IOC_ENABLE); --->
RDPMC
Counters are still leaked to the thread B.
- Only reset the un-assigned dirty counters before updating the CR4.PCE
bit. The method is implemented here.
The dirty counter is a counter, on which the assigned event has been
deleted, but the counter is not reset. To track the dirty counters,
add a 'dirty' variable in the struct cpu_hw_events.
The security issue can only be found with an RDPMC task. To enable the
RDPMC, the CR4.PCE bit has to be updated. Add a
perf_clear_dirty_counters() right before updating the CR4.PCE bit to
clear the existing dirty counters. Only the current un-assigned dirty
counters are reset, because the RDPMC assigned dirty counters will be
updated soon.
After applying the patch,
$ ./rdpmc_read_all_counters
index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0
index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0
Performance
The performance of a context switch is only impacted when there are two
or more perf users and one of the users must be an RDPMC user. In other
cases, there is no performance impact.
The worst-case occurs when there are two users: the RDPMC user only
uses one counter, while the other user uses all available counters.
When the RDPMC task is scheduled in, all the counters, other than the
RDPMC assigned one, have to be reset.
Test results for the worst-case, using a modified lat_ctx as measured
on an Ice Lake platform, which has 8 GP and 3 FP counters (ignoring
SLOTS).
lat_ctx -s 128K -N 1000 processes 2
Without the patch:
The context switch time is 4.97 us
With the patch:
The context switch time is 5.16 us
There is a ~4% performance drop in the context-switching time for the
worst-case.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1623693582-187370-1-git-send-email-kan.liang@linux.intel.com
2021-06-15 01:59:42 +08:00
|
|
|
__set_bit(event->hw.idx, cpuc->dirty);
|
|
|
|
|
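The __set_bit() above only records that the counter is now dirty; the actual wipe happens later, right before CR4.PCE is set for an RDPMC user, as the commit message describes. A simplified, hedged sketch of that clean-up (the in-tree helper is perf_clear_dirty_counters(); this version omits the step that skips currently assigned counters):

#include <linux/bitmap.h>

static void sketch_clear_dirty_counters(struct cpu_hw_events *cpuc)
{
	int i;

	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
		if (i >= INTEL_PMC_IDX_FIXED)	/* fixed counters live above the GP range */
			wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
		else
			wrmsrl(x86_pmu_event_addr(i), 0);
	}
	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
}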
2014-02-24 19:26:21 +08:00
|
|
|
/*
|
|
|
|
* Not a TXN, therefore cleanup properly.
|
|
|
|
*/
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
|
|
|
x86_pmu_stop(event, PERF_EF_UPDATE);
|
2009-06-22 22:35:24 +08:00
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
for (i = 0; i < cpuc->n_events; i++) {
|
2014-02-24 19:26:21 +08:00
|
|
|
if (event == cpuc->event_list[i])
|
|
|
|
break;
|
|
|
|
}
|
2010-01-18 16:58:01 +08:00
|
|
|
|
2014-02-24 19:26:21 +08:00
|
|
|
if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
|
|
|
|
return;
|
perf/x86: Fix event scheduling
Vince "Super Tester" Weaver reported a new round of syscall fuzzing (Trinity) failures,
with perf WARN_ON()s triggering. He also provided traces of the failures.
This is I think the relevant bit:
> pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_disable: x86_pmu_disable
> pec_1076_warn-2804 [000] d... 147.926153: x86_pmu_state: Events: {
> pec_1076_warn-2804 [000] d... 147.926156: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null))
> pec_1076_warn-2804 [000] d... 147.926158: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800)
> pec_1076_warn-2804 [000] d... 147.926159: x86_pmu_state: }
> pec_1076_warn-2804 [000] d... 147.926160: x86_pmu_state: n_events: 1, n_added: 0, n_txn: 1
> pec_1076_warn-2804 [000] d... 147.926161: x86_pmu_state: Assignment: {
> pec_1076_warn-2804 [000] d... 147.926162: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800)
> pec_1076_warn-2804 [000] d... 147.926163: x86_pmu_state: }
> pec_1076_warn-2804 [000] d... 147.926166: collect_events: Adding event: 1 (ffff880119ec8800)
So we add the insn:p event (fd[23]).
At this point we should have:
n_events = 2, n_added = 1, n_txn = 1
> pec_1076_warn-2804 [000] d... 147.926170: collect_events: Adding event: 0 (ffff8800c9e01800)
> pec_1076_warn-2804 [000] d... 147.926172: collect_events: Adding event: 4 (ffff8800cbab2c00)
We try and add the {BP,cycles,br_insn} group (fd[3], fd[4], fd[15]).
These events are 0:cycles and 4:br_insn, the BP event isn't x86_pmu so
that's not visible.
group_sched_in()
pmu->start_txn() /* nop - BP pmu */
event_sched_in()
event->pmu->add()
So here we should end up with:
0: n_events = 3, n_added = 2, n_txn = 2
4: n_events = 4, n_added = 3, n_txn = 3
But seeing the below state on x86_pmu_enable(), they must have failed,
because the 0 and 4 events aren't there anymore.
Looking at group_sched_in(), since the BP is the leader, its
event_sched_in() must have succeeded, for otherwise we would not have
seen the sibling adds.
But since neither 0 or 4 are in the below state; their event_sched_in()
must have failed; but I don't see why, the complete state: 0,0,1:p,4
fits perfectly fine on a core2.
However, since we try and schedule 4 it means the 0 event must have
succeeded! Therefore the 4 event must have failed, its failure will
have put group_sched_in() into the fail path, which will call:
event_sched_out()
event->pmu->del()
on 0 and the BP event.
Now x86_pmu_del() will reduce n_events; but it will not reduce n_added;
giving what we see below:
n_event = 2, n_added = 2, n_txn = 2
> pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_enable: x86_pmu_enable
> pec_1076_warn-2804 [000] d... 147.926177: x86_pmu_state: Events: {
> pec_1076_warn-2804 [000] d... 147.926179: x86_pmu_state: 0: state: .R config: ffffffffffffffff ( (null))
> pec_1076_warn-2804 [000] d... 147.926181: x86_pmu_state: 33: state: AR config: 0 (ffff88011ac99800)
> pec_1076_warn-2804 [000] d... 147.926182: x86_pmu_state: }
> pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: n_events: 2, n_added: 2, n_txn: 2
> pec_1076_warn-2804 [000] d... 147.926184: x86_pmu_state: Assignment: {
> pec_1076_warn-2804 [000] d... 147.926186: x86_pmu_state: 0->33 tag: 1 config: 0 (ffff88011ac99800)
> pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: 1->0 tag: 1 config: 1 (ffff880119ec8800)
> pec_1076_warn-2804 [000] d... 147.926188: x86_pmu_state: }
> pec_1076_warn-2804 [000] d... 147.926190: x86_pmu_enable: S0: hwc->idx: 33, hwc->last_cpu: 0, hwc->last_tag: 1 hwc->state: 0
So the problem is that x86_pmu_del(), when called from a
group_sched_in() that fails (for whatever reason), and without x86_pmu
TXN support (because the leader is !x86_pmu), will corrupt the n_added
state.
Reported-and-Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Dave Jones <davej@redhat.com>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20140221150312.GF3104@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-02-21 23:03:12 +08:00
|
|
|
|
2014-02-24 19:26:21 +08:00
|
|
|
/* If we have a newly added event, make sure to decrease n_added. */
|
|
|
|
if (i >= cpuc->n_events - cpuc->n_added)
|
|
|
|
--cpuc->n_added;
|
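The check above relies on event_list[] keeping the already-committed events first and the n_added most recently collected (not yet programmed) events at the tail. A hypothetical predicate, included only to make that index arithmetic explicit (event_is_newly_added() is not in-tree code):

#include <linux/types.h>

static inline bool event_is_newly_added(struct cpu_hw_events *cpuc, int i)
{
	/* indices [n_events - n_added, n_events) are the uncommitted tail */
	return i >= cpuc->n_events - cpuc->n_added;
}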
2010-01-18 16:58:01 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
|
2014-02-24 19:26:21 +08:00
|
|
|
|
|
|
|
/* Delete the array entry. */
|
2015-05-21 16:57:13 +08:00
|
|
|
while (++i < cpuc->n_events) {
|
2014-02-24 19:26:21 +08:00
|
|
|
cpuc->event_list[i-1] = cpuc->event_list[i];
|
2015-05-21 16:57:13 +08:00
|
|
|
cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
|
|
|
|
}
|
2019-03-14 20:03:00 +08:00
|
|
|
cpuc->event_constraint[i-1] = NULL;
|
2014-02-24 19:26:21 +08:00
|
|
|
--cpuc->n_events;
|
perf/x86/intel: Hybrid PMU support for perf capabilities
Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.
Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.
The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.
For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for these two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.
Add is_hybrid() to distinguish a hybrid system. ADL may have an
alternative configuration. With that configuration, the
X86_FEATURE_HYBRID_CPU is not set. Perf cannot rely on the feature bit.
Add a new static_key_false, perf_is_hybrid, to indicate a hybrid system.
It will be assigned in the following "Add Alder Lake Hybrid support"
patch as well.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-5-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:44 +08:00
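As an illustration of the capability split described above, here is a rough, userspace-compilable model; toy_hybrid_pmu, toy_caps and perf_is_hybrid are simplified stand-ins for the kernel's x86_hybrid_pmu, its perf capabilities and the static key, not the actual layout.

/* Rough model of the per-hybrid-PMU capability lookup; toy types only. */
#include <stdbool.h>
#include <stdio.h>

struct toy_caps {
	bool perf_metrics;
	bool pebs_output_pt_available;
};

struct toy_hybrid_pmu {
	const char *name;
	struct toy_caps intel_cap;	/* model-specific, per hybrid PMU */
};

static bool perf_is_hybrid;		/* stands in for the static key */
static struct toy_caps global_cap;	/* architectural bits only */

/* On a hybrid system the model-specific bits must come from the
 * hybrid PMU, never from the global capabilities. */
static struct toy_caps *active_cap(struct toy_hybrid_pmu *hpmu)
{
	return perf_is_hybrid ? &hpmu->intel_cap : &global_cap;
}

int main(void)
{
	struct toy_hybrid_pmu core = { "cpu_core", { .perf_metrics = true } };

	perf_is_hybrid = true;
	printf("perf metrics on %s: %d\n", core.name,
	       active_cap(&core)->perf_metrics);
	return 0;
}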
|
|
|
if (intel_cap.perf_metrics)
|
perf/x86/intel: Generic support for hardware TopDown metrics
Intro
=====
The TopDown Microarchitecture Analysis (TMA) Method is a structured
analysis methodology to identify critical performance bottlenecks in
out-of-order processors. Current perf already supports the method.
The method works well, but there is one problem. To collect the TopDown
events, several GP counters have to be used. If a user wants to collect
other events at the same time, multiplexing will probably be triggered,
which impacts the accuracy.
To free up the scarce GP counters, the hardware TopDown metrics feature
is introduced from Ice Lake. The hardware implements an additional
"metrics" register and a new Fixed Counter 3 that measures pipeline
"slots". The TopDown events can be calculated from them instead.
Events
======
The level 1 TopDown has four metrics. There is no event-code assigned to
the TopDown metrics. Four metric events are exported as separate perf
events, which map to the internal "metrics" counter register. Those
events do not exist in hardware, but can be allocated by the scheduler.
For the event mapping, a special 0x00 event code is used, which is
reserved for fake events. The metric events start from umask 0x10.
When setting up the metric events, they point to the Fixed Counter 3.
They have to be specially handled.
- Add the update_topdown_event() callback to read the additional metrics
MSR and generate the metrics.
- Add the set_topdown_event_period() callback to initialize metrics MSR
and the fixed counter 3.
- Add a variable n_metric_event to track the number of accepted
metric events. Sharing the same metric between multiple users
without multiplexing is not allowed.
- Only enable/disable the fixed counter 3 when there are no other active
TopDown events, which avoids unnecessary writes of the fixed
control register.
- Disable the PMU when reading the metrics event. The metrics MSR and
the fixed counter 3 are read separately. The values may be modified by
an NMI.
None of the four metric events supports sampling. Since they will be
handled specially for event update, a flag PERF_X86_EVENT_TOPDOWN is
introduced to indicate this case.
The slots event can support both sampling and counting.
For counting, the flag is also applied.
For sampling, it will be handled normally as other normal events.
Groups
======
The slots event is required in a Topdown group.
To avoid reading the METRICS register multiple times, the metrics and
slots value can only be updated by slots event in a group.
All active slots and metrics events will be updated one time.
Therefore, the slots event must be before any metric events in a Topdown
group.
NMI
======
The METRICS-related register may overflow. If so, bit 48 of the STATUS
register will be set, and PERF_METRICS and Fixed counter 3 are
required to be reset. The patch also updates all active slots and
metrics events in the NMI handler.
The update_topdown_event() has to read two registers separately. The
values may be modified by an NMI. PMU has to be disabled before calling
the function.
RDPMC
======
RDPMC is temporarily disabled. A later patch will enable it.
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200723171117.9918-9-kan.liang@linux.intel.com
2020-07-24 01:11:11 +08:00
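A small sketch of how a level-1 metric event can be recognized purely from the encoding rules above (event code 0x00, umasks starting at 0x10 for the four level-1 metrics), using the standard PERFEVTSEL layout of event in bits 0-7 and umask in bits 8-15. The TOY_* mask names and the 0x13 upper bound are illustrative assumptions, not the kernel's definitions.

/* Sketch: recognize a level-1 TopDown metric event from its raw config. */
#include <stdbool.h>
#include <stdint.h>

#define TOY_EVENTSEL_EVENT	0x000000ffULL
#define TOY_EVENTSEL_UMASK	0x0000ff00ULL

static bool toy_is_topdown_metric(uint64_t config)
{
	uint64_t event = config & TOY_EVENTSEL_EVENT;
	uint64_t umask = (config & TOY_EVENTSEL_UMASK) >> 8;

	/* Fake event code 0x00 plus a metric umask (0x10..0x13 for the
	 * four level-1 metrics). */
	return event == 0x00 && umask >= 0x10 && umask <= 0x13;
}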
|
|
|
del_nr_metric_event(cpuc, event);
|
2010-01-18 16:58:01 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
perf_event_update_userpage(event);
|
perf/x86: Ensure perf_sched_cb_{inc,dec}() is only called from pmu::{add,del}()
Currently perf_sched_cb_{inc,dec}() are called from
pmu::{start,stop}(), which has the problem that this can happen from
NMI context, this is making it hard to optimize perf_pmu_sched_task().
Furthermore, we really only need this accounting on pmu::{add,del}(),
so doing it from pmu::{start,stop}() is doing more work than we really
need.
Introduce x86_pmu::{add,del}() and wire up the LBR and PEBS.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-07-07 00:02:43 +08:00
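The shape of the change is roughly the following: an optional vendor ->del() hook invoked from the generic delete path. This is a toy model with hypothetical toy_* names; the kernel reaches the hook through static_call_cond(), as the code below shows.

/* Toy illustration of an optional vendor ->del() hook. */
struct toy_event;

struct toy_x86_pmu {
	void (*add)(struct toy_event *event);
	void (*del)(struct toy_event *event);
};

static struct toy_x86_pmu toy_pmu;	/* hooks filled in by the vendor driver */

static void toy_pmu_del(struct toy_event *event)
{
	/* ... generic bookkeeping would go here ... */
	if (toy_pmu.del)		/* optional vendor hook, e.g. LBR/PEBS */
		toy_pmu.del(event);
}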
|
|
|
|
|
|
|
do_del:
|
2020-08-18 21:57:53 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is after x86_pmu_stop(); so we disable LBRs after any
|
|
|
|
* event can need them etc..
|
|
|
|
*/
|
|
|
|
static_call_cond(x86_pmu_del)(event);
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
int x86_pmu_handle_irq(struct pt_regs *regs)
|
2009-04-29 18:47:21 +08:00
|
|
|
{
|
2009-06-11 03:02:22 +08:00
|
|
|
struct perf_sample_data data;
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
struct cpu_hw_events *cpuc;
|
|
|
|
struct perf_event *event;
|
2009-07-09 05:46:14 +08:00
|
|
|
int idx, handled = 0;
|
2009-05-15 14:26:20 +08:00
|
|
|
u64 val;
|
|
|
|
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and fewer registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
cpuc = this_cpu_ptr(&cpu_hw_events);
|
2009-05-13 19:21:36 +08:00
|
|
|
|
2011-04-27 18:32:33 +08:00
|
|
|
/*
|
|
|
|
* Some chipsets need to unmask the LVTPC in a particular spot
|
|
|
|
* inside the nmi handler. As a result, the unmasking was pushed
|
|
|
|
* into all the nmi handlers.
|
|
|
|
*
|
|
|
|
* This generic handler doesn't seem to have any issues where the
|
|
|
|
* unmasking occurs so it was left at the top.
|
|
|
|
*/
|
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
|
|
|
|
2010-03-30 00:36:50 +08:00
|
|
|
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
|
2019-04-02 23:21:18 +08:00
|
|
|
if (!test_bit(idx, cpuc->active_mask))
|
2009-04-29 18:47:21 +08:00
|
|
|
continue;
|
2009-05-13 19:21:36 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
event = cpuc->events[idx];
|
2009-05-14 20:52:17 +08:00
|
|
|
|
2022-05-11 03:27:22 +08:00
|
|
|
val = static_call(x86_pmu_update)(event);
|
2010-03-30 00:36:50 +08:00
|
|
|
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
|
2009-05-25 23:39:04 +08:00
|
|
|
continue;
|
2009-05-13 19:21:36 +08:00
|
|
|
|
2009-06-11 03:34:59 +08:00
|
|
|
/*
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
* event overflow
|
2009-06-11 03:34:59 +08:00
|
|
|
*/
|
perf, x86: Try to handle unknown nmis with an enabled PMU
When the PMU is enabled it is valid to have unhandled nmis: two
events could trigger 'simultaneously', raising two back-to-back
NMIs. If the first NMI handles both, the latter will be empty
and daze the CPU.
The solution to avoid an 'unknown nmi' message in this case was
simply to stop the nmi handler chain when the PMU is enabled by
stating the nmi was handled. This has the drawback that a) we
cannot detect unknown nmis anymore, and b) subsequent nmi
handlers are not called.
This patch addresses this. Now, we check whether this unknown NMI
could be a PMU back-to-back NMI. Otherwise we pass it on and let
the kernel handle the unknown nmi.
This is a debug log:
cpu #6, nmi #32333, skip_nmi #32330, handled = 1, time = 1934364430
cpu #6, nmi #32334, skip_nmi #32330, handled = 1, time = 1934704616
cpu #6, nmi #32335, skip_nmi #32336, handled = 2, time = 1936032320
cpu #6, nmi #32336, skip_nmi #32336, handled = 0, time = 1936034139
cpu #6, nmi #32337, skip_nmi #32336, handled = 1, time = 1936120100
cpu #6, nmi #32338, skip_nmi #32336, handled = 1, time = 1936404607
cpu #6, nmi #32339, skip_nmi #32336, handled = 1, time = 1937983416
cpu #6, nmi #32340, skip_nmi #32341, handled = 2, time = 1938201032
cpu #6, nmi #32341, skip_nmi #32341, handled = 0, time = 1938202830
cpu #6, nmi #32342, skip_nmi #32341, handled = 1, time = 1938443743
cpu #6, nmi #32343, skip_nmi #32341, handled = 1, time = 1939956552
cpu #6, nmi #32344, skip_nmi #32341, handled = 1, time = 1940073224
cpu #6, nmi #32345, skip_nmi #32341, handled = 1, time = 1940485677
cpu #6, nmi #32346, skip_nmi #32347, handled = 2, time = 1941947772
cpu #6, nmi #32347, skip_nmi #32347, handled = 1, time = 1941949818
cpu #6, nmi #32348, skip_nmi #32347, handled = 0, time = 1941951591
Uhhuh. NMI received for unknown reason 00 on CPU 6.
Do you have a strange power saving mode enabled?
Dazed and confused, but trying to continue
Deltas:
nmi #32334 340186
nmi #32335 1327704
nmi #32336 1819 <<<< back-to-back nmi [1]
nmi #32337 85961
nmi #32338 284507
nmi #32339 1578809
nmi #32340 217616
nmi #32341 1798 <<<< back-to-back nmi [2]
nmi #32342 240913
nmi #32343 1512809
nmi #32344 116672
nmi #32345 412453
nmi #32346 1462095 <<<< 1st nmi (standard) handling 2 counters
nmi #32347 2046 <<<< 2nd nmi (back-to-back) handling one
counter nmi #32348 1773 <<<< 3rd nmi (back-to-back)
handling no counter! [3]
For back-to-back nmi detection there are the following rules:
The PMU nmi handler was handling more than one counter and no
counter was handled in the subsequent nmi (see [1] and [2]
above).
There is another case if there are two subsequent back-to-back
nmis [3]. The 2nd is detected as back-to-back because the first
handled more than one counter. If the second handles one counter
and the 3rd handles nothing, we drop the 3rd nmi because it
could be a back-to-back nmi.
Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ renamed nmi variable to pmu_nmi to avoid clash with .nmi in entry.S ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: fweisbec@gmail.com
Cc: ying.huang@intel.com
Cc: ming.m.lin@intel.com
Cc: eranian@google.com
LKML-Reference: <1283454469-1909-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-09-03 03:07:48 +08:00
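A toy model of the back-to-back rule spelled out above, using hypothetical names; it only mirrors the "mark the next couple of NMIs after a multi-counter PMI" idea from the changelog and is not the kernel implementation.

/* Toy back-to-back NMI heuristic; per-CPU state modeled as plain statics. */
#include <stdbool.h>

static unsigned long this_nmi;		/* running NMI number */
static unsigned long marked_nmi;	/* last NMI number we expect echoes for */

/* Call once per PMI with the number of counters it handled; returns true
 * when an otherwise-unknown NMI should be swallowed as a back-to-back PMI. */
static bool toy_maybe_back_to_back(int handled)
{
	this_nmi++;
	if (handled > 1) {
		/* Two counters handled: the next two NMIs may be echoes. */
		marked_nmi = this_nmi + 2;
		return false;	/* genuinely handled */
	}
	/* Nothing handled, but still inside the marked window: treat it as
	 * a back-to-back PMI rather than an unknown NMI. */
	return handled == 0 && this_nmi <= marked_nmi;
}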
|
|
|
handled++;
|
2009-06-11 03:34:59 +08:00
|
|
|
|
2022-05-11 03:27:22 +08:00
|
|
|
if (!static_call(x86_pmu_set_period)(event))
|
2009-06-02 22:08:20 +08:00
|
|
|
continue;
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates, more branches may be executed, skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-23 06:15:07 +08:00
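For illustration, a check of the two BRS constraints called out above (a fixed, non-frequency period larger than the 16-deep buffer); toy_attr and its field names only mimic perf_event_attr and are assumptions, not kernel code.

/* Sketch of the BRS sampling constraints from the changelog. */
#include <stdbool.h>
#include <stdint.h>

#define TOY_BRS_DEPTH 16

struct toy_attr {
	bool freq;		/* frequency mode requested? */
	uint64_t sample_period;
};

static bool toy_brs_config_ok(const struct toy_attr *attr)
{
	if (attr->freq)			/* BRS needs a fixed period */
		return false;
	return attr->sample_period > TOY_BRS_DEPTH;
}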
|
|
|
perf_sample_data_init(&data, 0, event->hw.last_period);
|
|
|
|
|
2023-04-27 11:05:27 +08:00
|
|
|
if (has_branch_stack(event))
|
|
|
|
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates, more branches may be executed, skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-23 06:15:07 +08:00
|
|
|
|
2011-06-27 20:41:57 +08:00
|
|
|
if (perf_event_overflow(event, &data, regs))
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
1) We disable the counter:
a) the pmu has per-counter enable bits, we flip that
b) we program a NOP event, preserving the counter state
2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
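A sketch of the reworked method table this describes: add/del/start/stop callbacks that all take a flags argument. The toy_* names and TOY_EF_* flags are placeholders; the kernel's real flags are the PERF_EF_* constants.

/* Shape of the reworked PMU method table; toy struct, not struct pmu. */
struct toy_event;

#define TOY_EF_START	0x01	/* start the counter when adding it */
#define TOY_EF_UPDATE	0x02	/* read the count back when stopping */

struct toy_pmu {
	int  (*add)(struct toy_event *event, int flags);   /* schedule on PMU   */
	void (*del)(struct toy_event *event, int flags);   /* unschedule        */
	void (*start)(struct toy_event *event, int flags); /* leave the stopped state */
	void (*stop)(struct toy_event *event, int flags);  /* enter the stopped state */
};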
|
|
|
x86_pmu_stop(event, 0);
|
2009-04-29 18:47:21 +08:00
|
|
|
}
|
2009-05-13 19:21:36 +08:00
|
|
|
|
2009-06-11 03:34:59 +08:00
|
|
|
if (handled)
|
|
|
|
inc_irq_stat(apic_perf_irqs);
|
|
|
|
|
2009-04-29 18:47:21 +08:00
|
|
|
return handled;
|
|
|
|
}
|
2009-04-29 18:47:05 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
void perf_events_lapic_init(void)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
2009-08-11 16:40:08 +08:00
|
|
|
if (!x86_pmu.apic || !x86_pmu_initialized())
|
2008-12-03 17:39:53 +08:00
|
|
|
return;
|
2009-04-29 18:47:20 +08:00
|
|
|
|
2008-12-03 17:39:53 +08:00
|
|
|
/*
|
2009-05-29 13:28:35 +08:00
|
|
|
* Always use NMI for PMU
|
2008-12-03 17:39:53 +08:00
|
|
|
*/
|
2009-05-29 13:28:35 +08:00
|
|
|
apic_write(APIC_LVTPC, APIC_DM_NMI);
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
|
|
|
|
2014-04-17 16:18:14 +08:00
|
|
|
static int
|
2011-10-01 03:06:21 +08:00
|
|
|
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
|
2008-12-03 17:39:53 +08:00
|
|
|
{
|
2013-06-21 23:51:36 +08:00
|
|
|
u64 start_clock;
|
|
|
|
u64 finish_clock;
|
perf/x86: Fix NMI measurements
OK, so what I'm actually seeing on my WSM is that sched/clock.c is
'broken' for the purpose we're using it for.
What triggered it is that my WSM-EP is broken :-(
[ 0.001000] tsc: Fast TSC calibration using PIT
[ 0.002000] tsc: Detected 2533.715 MHz processor
[ 0.500180] TSC synchronization [CPU#0 -> CPU#6]:
[ 0.505197] Measured 3 cycles TSC warp between CPUs, turning off TSC clock.
[ 0.004000] tsc: Marking TSC unstable due to check_tsc_sync_source failed
For some reason it consistently detects TSC skew, even though NHM+
should have a single clock domain for 'reasonable' systems.
This marks sched_clock_stable=0, which means that we do fancy stuff to
try and get a 'sane' clock. Part of this fancy stuff relies on the tick,
clearly that's gone when NOHZ=y. So for idle cpus time gets stuck, until
it either wakes up or gets kicked by another cpu.
While this is perfectly fine for the scheduler -- it only cares about
actually running stuff, and when we're running stuff we're obviously not
idle. This does somewhat break down for perf which can trigger events
just fine on an otherwise idle cpu.
So I've got NMIs getting 'measured' as taking ~1ms, which actually
don't last nearly that long:
<idle>-0 [013] d.h. 886.311970: rcu_nmi_enter <-do_nmi
...
<idle>-0 [013] d.h. 886.311997: perf_sample_event_took: HERE!!! : 1040990
So ftrace (which uses sched_clock(), not the fancy bits) only sees
~27us, but we measure ~1ms !!
Now since all this measurement stuff lives in x86 code, we can actually
fix it.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Link: http://lkml.kernel.org/r/20131017133350.GG3364@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-17 21:32:10 +08:00
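A userspace sketch of the bracketing pattern the code below uses (take a timestamp, call the handler, report the delta). The clock choice and names here are illustrative; the kernel code uses sched_clock() and perf_sample_event_took().

/* Userspace sketch of timing a handler with a monotonic clock. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static int fake_handler(void) { return 1; }	/* stands in for the PMI handler */

int main(void)
{
	uint64_t start = mono_ns();
	int ret = fake_handler();
	uint64_t finish = mono_ns();

	printf("handler took %llu ns, ret=%d\n",
	       (unsigned long long)(finish - start), ret);
	return 0;
}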
|
|
|
int ret;
|
2013-06-21 23:51:36 +08:00
|
|
|
|
2015-06-09 18:03:26 +08:00
|
|
|
/*
|
|
|
|
* All PMUs/events that share this PMI handler should make sure to
|
|
|
|
* increment active_events for their events.
|
|
|
|
*/
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
if (!atomic_read(&active_events))
|
2011-10-01 03:06:21 +08:00
|
|
|
return NMI_DONE;
|
perf, x86: Try to handle unknown nmis with an enabled PMU
When the PMU is enabled it is valid to have unhandled nmis: two
events could trigger 'simultaneously', raising two back-to-back
NMIs. If the first NMI handles both, the latter will be empty
and daze the CPU.
The solution to avoid an 'unknown nmi' message in this case was
simply to stop the nmi handler chain when the PMU is enabled by
stating the nmi was handled. This has the drawback that a) we
cannot detect unknown nmis anymore, and b) subsequent nmi
handlers are not called.
This patch addresses this. Now, we check whether this unknown NMI
could be a PMU back-to-back NMI. Otherwise we pass it on and let
the kernel handle the unknown nmi.
This is a debug log:
cpu #6, nmi #32333, skip_nmi #32330, handled = 1, time = 1934364430
cpu #6, nmi #32334, skip_nmi #32330, handled = 1, time = 1934704616
cpu #6, nmi #32335, skip_nmi #32336, handled = 2, time = 1936032320
cpu #6, nmi #32336, skip_nmi #32336, handled = 0, time = 1936034139
cpu #6, nmi #32337, skip_nmi #32336, handled = 1, time = 1936120100
cpu #6, nmi #32338, skip_nmi #32336, handled = 1, time = 1936404607
cpu #6, nmi #32339, skip_nmi #32336, handled = 1, time = 1937983416
cpu #6, nmi #32340, skip_nmi #32341, handled = 2, time = 1938201032
cpu #6, nmi #32341, skip_nmi #32341, handled = 0, time = 1938202830
cpu #6, nmi #32342, skip_nmi #32341, handled = 1, time = 1938443743
cpu #6, nmi #32343, skip_nmi #32341, handled = 1, time = 1939956552
cpu #6, nmi #32344, skip_nmi #32341, handled = 1, time = 1940073224
cpu #6, nmi #32345, skip_nmi #32341, handled = 1, time = 1940485677
cpu #6, nmi #32346, skip_nmi #32347, handled = 2, time = 1941947772
cpu #6, nmi #32347, skip_nmi #32347, handled = 1, time = 1941949818
cpu #6, nmi #32348, skip_nmi #32347, handled = 0, time = 1941951591
Uhhuh. NMI received for unknown reason 00 on CPU 6.
Do you have a strange power saving mode enabled?
Dazed and confused, but trying to continue
Deltas:
nmi #32334 340186
nmi #32335 1327704
nmi #32336 1819 <<<< back-to-back nmi [1]
nmi #32337 85961
nmi #32338 284507
nmi #32339 1578809
nmi #32340 217616
nmi #32341 1798 <<<< back-to-back nmi [2]
nmi #32342 240913
nmi #32343 1512809
nmi #32344 116672
nmi #32345 412453
nmi #32346 1462095 <<<< 1st nmi (standard) handling 2 counters
nmi #32347 2046 <<<< 2nd nmi (back-to-back) handling one
counter nmi #32348 1773 <<<< 3rd nmi (back-to-back)
handling no counter! [3]
For back-to-back nmi detection there are the following rules:
The PMU nmi handler was handling more than one counter and no
counter was handled in the subsequent nmi (see [1] and [2]
above).
There is another case if there are two subsequent back-to-back
nmis [3]. The 2nd is detected as back-to-back because the first
handled more than one counter. If the second handles one counter
and the 3rd handles nothing, we drop the 3rd nmi because it
could be a back-to-back nmi.
Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ renamed nmi variable to pmu_nmi to avoid clash with .nmi in entry.S ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: peterz@infradead.org
Cc: gorcunov@gmail.com
Cc: fweisbec@gmail.com
Cc: ying.huang@intel.com
Cc: ming.m.lin@intel.com
Cc: eranian@google.com
LKML-Reference: <1283454469-1909-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-09-03 03:07:48 +08:00
|
|
|
|
perf/x86: Fix NMI measurements
OK, so what I'm actually seeing on my WSM is that sched/clock.c is
'broken' for the purpose we're using it for.
What triggered it is that my WSM-EP is broken :-(
[ 0.001000] tsc: Fast TSC calibration using PIT
[ 0.002000] tsc: Detected 2533.715 MHz processor
[ 0.500180] TSC synchronization [CPU#0 -> CPU#6]:
[ 0.505197] Measured 3 cycles TSC warp between CPUs, turning off TSC clock.
[ 0.004000] tsc: Marking TSC unstable due to check_tsc_sync_source failed
For some reason it consistently detects TSC skew, even though NHM+
should have a single clock domain for 'reasonable' systems.
This marks sched_clock_stable=0, which means that we do fancy stuff to
try and get a 'sane' clock. Part of this fancy stuff relies on the tick,
clearly that's gone when NOHZ=y. So for idle cpus time gets stuck, until
it either wakes up or gets kicked by another cpu.
While this is perfectly fine for the scheduler -- it only cares about
actually running stuff, and when we're running stuff we're obviously not
idle. This does somewhat break down for perf which can trigger events
just fine on an otherwise idle cpu.
So I've got NMIs getting 'measured' as taking ~1ms, which actually
don't last nearly that long:
<idle>-0 [013] d.h. 886.311970: rcu_nmi_enter <-do_nmi
...
<idle>-0 [013] d.h. 886.311997: perf_sample_event_took: HERE!!! : 1040990
So ftrace (which uses sched_clock(), not the fancy bits) only sees
~27us, but we measure ~1ms !!
Now since all this measurement stuff lives in x86 code, we can actually
fix it.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Link: http://lkml.kernel.org/r/20131017133350.GG3364@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-17 21:32:10 +08:00
|
|
|
start_clock = sched_clock();
|
2020-08-18 21:57:53 +08:00
|
|
|
ret = static_call(x86_pmu_handle_irq)(regs);
|
perf/x86: Fix NMI measurements
OK, so what I'm actually seeing on my WSM is that sched/clock.c is
'broken' for the purpose we're using it for.
What triggered it is that my WSM-EP is broken :-(
[ 0.001000] tsc: Fast TSC calibration using PIT
[ 0.002000] tsc: Detected 2533.715 MHz processor
[ 0.500180] TSC synchronization [CPU#0 -> CPU#6]:
[ 0.505197] Measured 3 cycles TSC warp between CPUs, turning off TSC clock.
[ 0.004000] tsc: Marking TSC unstable due to check_tsc_sync_source failed
For some reason it consistently detects TSC skew, even though NHM+
should have a single clock domain for 'reasonable' systems.
This marks sched_clock_stable=0, which means that we do fancy stuff to
try and get a 'sane' clock. Part of this fancy stuff relies on the tick,
clearly that's gone when NOHZ=y. So for idle cpus time gets stuck, until
it either wakes up or gets kicked by another cpu.
While this is perfectly fine for the scheduler -- it only cares about
actually running stuff, and when we're running stuff we're obviously not
idle. This does somewhat break down for perf which can trigger events
just fine on an otherwise idle cpu.
So I've got NMIs getting 'measured' as taking ~1ms, which actually
don't last nearly that long:
<idle>-0 [013] d.h. 886.311970: rcu_nmi_enter <-do_nmi
...
<idle>-0 [013] d.h. 886.311997: perf_sample_event_took: HERE!!! : 1040990
So ftrace (which uses sched_clock(), not the fancy bits) only sees
~27us, but we measure ~1ms !!
Now since all this measurement stuff lives in x86 code, we can actually
fix it.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: mingo@kernel.org
Cc: dave.hansen@linux.intel.com
Cc: eranian@google.com
Cc: Don Zickus <dzickus@redhat.com>
Cc: jmario@redhat.com
Cc: acme@infradead.org
Link: http://lkml.kernel.org/r/20131017133350.GG3364@laptop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-17 21:32:10 +08:00
|
|
|
finish_clock = sched_clock();
|
2013-06-21 23:51:36 +08:00
|
|
|
|
|
|
|
perf_sample_event_took(finish_clock - start_clock);
|
|
|
|
|
|
|
|
return ret;
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
2014-04-17 16:18:14 +08:00
|
|
|
NOKPROBE_SYMBOL(perf_event_nmi_handler);
|
2008-12-03 17:39:53 +08:00
|
|
|
|
2011-08-31 07:41:05 +08:00
|
|
|
struct event_constraint emptyconstraint;
|
|
|
|
struct event_constraint unconstrained;
|
x86: AMD Support for perf_counter
Supported basic performance counter for AMD K7 and later:
$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null
Performance counter stats for 'ls':
12.298610 task clock ticks (msecs)
3298477 CPU cycles (events)
1406354 instructions (events)
749035 cache references (events)
16939 cache misses (events)
100589 branches (events)
11159 branch misses (events)
7.627540 cpu clock ticks (msecs)
12.298610 task clock ticks (msecs)
500 pagefaults (events)
6 context switches (events)
3 CPU migrations (events)
Wall-clock time elapsed: 8.672290 msecs
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-02-27 22:45:14 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
static int x86_pmu_prepare_cpu(unsigned int cpu)
|
2010-03-05 20:01:18 +08:00
|
|
|
{
|
2011-07-22 19:41:54 +08:00
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
2016-07-14 01:16:10 +08:00
|
|
|
int i;
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
|
|
|
|
cpuc->kfree_on_online[i] = NULL;
|
|
|
|
if (x86_pmu.cpu_prepare)
|
|
|
|
return x86_pmu.cpu_prepare(cpu);
|
|
|
|
return 0;
|
|
|
|
}
|
2011-07-22 19:41:54 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
static int x86_pmu_dead_cpu(unsigned int cpu)
|
|
|
|
{
|
|
|
|
if (x86_pmu.cpu_dead)
|
|
|
|
x86_pmu.cpu_dead(cpu);
|
|
|
|
return 0;
|
|
|
|
}
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
static int x86_pmu_online_cpu(unsigned int cpu)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
|
|
|
|
int i;
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
|
|
|
|
kfree(cpuc->kfree_on_online[i]);
|
|
|
|
cpuc->kfree_on_online[i] = NULL;
|
2010-03-05 20:01:18 +08:00
|
|
|
}
|
2016-07-14 01:16:10 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
static int x86_pmu_starting_cpu(unsigned int cpu)
|
|
|
|
{
|
|
|
|
if (x86_pmu.cpu_starting)
|
|
|
|
x86_pmu.cpu_starting(cpu);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int x86_pmu_dying_cpu(unsigned int cpu)
|
|
|
|
{
|
|
|
|
if (x86_pmu.cpu_dying)
|
|
|
|
x86_pmu.cpu_dying(cpu);
|
|
|
|
return 0;
|
2010-03-05 20:01:18 +08:00
|
|
|
}
|
|
|
|
|
2009-12-11 00:56:34 +08:00
|
|
|
static void __init pmu_check_apic(void)
|
|
|
|
{
|
2016-04-05 04:25:00 +08:00
|
|
|
if (boot_cpu_has(X86_FEATURE_APIC))
|
2009-12-11 00:56:34 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
x86_pmu.apic = 0;
|
|
|
|
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
|
|
|
|
pr_info("no hardware sampling interrupt available.\n");
|
2014-05-17 05:18:07 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have a PMU initialized but no APIC
|
|
|
|
* interrupts, we cannot sample hardware
|
|
|
|
* events (user-space has to fall back and
|
|
|
|
* sample via a hrtimer based software event):
|
|
|
|
*/
|
|
|
|
pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
|
|
|
|
|
2009-12-11 00:56:34 +08:00
|
|
|
}
|
|
|
|
|
2018-08-10 23:43:14 +08:00
|
|
|
static struct attribute_group x86_pmu_format_group __ro_after_init = {
|
2012-03-16 03:09:14 +08:00
|
|
|
.name = "format",
|
|
|
|
.attrs = NULL,
|
|
|
|
};
|
|
|
|
|
2016-03-09 13:45:06 +08:00
|
|
|
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
|
2012-10-10 20:53:11 +08:00
|
|
|
{
|
2019-12-06 19:50:16 +08:00
|
|
|
struct perf_pmu_events_attr *pmu_attr =
|
2012-10-10 20:53:11 +08:00
|
|
|
container_of(attr, struct perf_pmu_events_attr, attr);
|
2019-12-06 19:50:16 +08:00
|
|
|
u64 config = 0;
|
|
|
|
|
|
|
|
if (pmu_attr->id < x86_pmu.max_events)
|
|
|
|
config = x86_pmu.event_map(pmu_attr->id);
|
2012-10-10 20:53:11 +08:00
|
|
|
|
2013-01-24 23:10:26 +08:00
|
|
|
/* string trumps id */
|
|
|
|
if (pmu_attr->event_str)
|
perf/x86: Unify format of events sysfs show
Sysfs show formats of files in /sys/devices/cpu/events/ are not unified,
some end with "\n", and some do not. Modify sysfs show format of events
defined by EVENT_ATTR_STR to end with "\n".
Before:
$ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo'
branch-instructions: event=0xc4$
branch-misses: event=0xc5$
bus-cycles: event=0x3c,umask=0x01$
cache-misses: event=0x2e,umask=0x41$
cache-references: event=0x2e,umask=0x4f$
cpu-cycles: event=0x3c$
instructions: event=0xc0$
ref-cycles: event=0x00,umask=0x03$
slots: event=0x00,umask=0x4
topdown-bad-spec: event=0x00,umask=0x81
topdown-be-bound: event=0x00,umask=0x83
topdown-fe-bound: event=0x00,umask=0x82
topdown-retiring: event=0x00,umask=0x80
After:
$ ls /sys/devices/cpu/events/* | xargs -i sh -c 'echo -n "{}: "; cat -A {}; echo'
/sys/devices/cpu/events/branch-instructions: event=0xc4$
/sys/devices/cpu/events/branch-misses: event=0xc5$
/sys/devices/cpu/events/bus-cycles: event=0x3c,umask=0x01$
/sys/devices/cpu/events/cache-misses: event=0x2e,umask=0x41$
/sys/devices/cpu/events/cache-references: event=0x2e,umask=0x4f$
/sys/devices/cpu/events/cpu-cycles: event=0x3c$
/sys/devices/cpu/events/instructions: event=0xc0$
/sys/devices/cpu/events/ref-cycles: event=0x00,umask=0x03$
/sys/devices/cpu/events/slots: event=0x00,umask=0x4$
Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220324031957.135595-1-yangjihong1@huawei.com
2022-03-24 11:19:57 +08:00
|
|
|
return sprintf(page, "%s\n", pmu_attr->event_str);
|
2012-10-10 20:53:11 +08:00
|
|
|
|
2013-01-24 23:10:26 +08:00
|
|
|
return x86_pmu.events_sysfs_show(page, config);
|
|
|
|
}
|
2016-03-09 13:45:06 +08:00
|
|
|
EXPORT_SYMBOL_GPL(events_sysfs_show);
|
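The strings built above land in sysfs and can be read back directly; a tiny user-space illustration (the PMU directory name is machine-dependent, e.g. cpu, cpu_core or cpu_atom):

#include <stdio.h>

int main(void)
{
        /* path is illustrative; hybrid systems expose cpu_core / cpu_atom instead */
        FILE *f = fopen("/sys/bus/event_source/devices/cpu/events/cpu-cycles", "r");
        char buf[128];

        if (f && fgets(buf, sizeof(buf), f))
                printf("cpu-cycles: %s", buf);   /* e.g. "event=0x3c\n" */
        if (f)
                fclose(f);
        return 0;
}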
2012-10-10 20:53:11 +08:00
|
|
|
|
2016-05-20 08:09:56 +08:00
|
|
|
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
|
|
|
|
char *page)
|
|
|
|
{
|
|
|
|
struct perf_pmu_events_ht_attr *pmu_attr =
|
|
|
|
container_of(attr, struct perf_pmu_events_ht_attr, attr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Report conditional events depending on Hyper-Threading.
|
|
|
|
*
|
|
|
|
* This is overly conservative as usually the HT special
|
|
|
|
* handling is not needed if the other CPU thread is idle.
|
|
|
|
*
|
|
|
|
* Note this does not (and cannot) handle the case when thread
|
|
|
|
* siblings are invisible, for example with virtualization
|
|
|
|
* if they are owned by some other guest. The user tool
|
|
|
|
* has to re-read when a thread sibling gets onlined later.
|
|
|
|
*/
|
|
|
|
return sprintf(page, "%s",
|
|
|
|
topology_max_smt_threads() > 1 ?
|
|
|
|
pmu_attr->event_str_ht :
|
|
|
|
pmu_attr->event_str_noht);
|
|
|
|
}
|
|
|
|
|
perf/x86: Add structures for the attributes of Hybrid PMUs
Hybrid PMUs have different events and formats. In theory, Hybrid PMU
specific attributes should be maintained in the dedicated struct
x86_hybrid_pmu, but it wastes space because the events and formats are
similar among Hybrid PMUs.
To reduce duplication, all hybrid PMUs will share a group of attributes
in the following patch. To distinguish an attribute from different
Hybrid PMUs, a PMU aware attribute structure is introduced. A PMU type
is required for the attribute structure. The type is for internal use. It
is not visible in the sysfs API.
Hybrid PMUs may support the same event name, but with different event
encoding, e.g., the mem-loads event on an Atom PMU has different event
encoding from a Core PMU. It causes an issue if two attributes are
created for them. Current sysfs_update_group finds an attribute by
searching the attr name (aka event name). If two attributes have the
same event name, the first attribute will be replaced.
To address the issue, only one attribute is created for the event. The
event_str is extended and stores event encodings from all Hybrid PMUs.
Each event encoding is divided by ";". The order of the event encodings
must follow the order of the hybrid PMU index. The event_str is for
internal use as well. When a user wants to show the attribute of a Hybrid PMU,
only the corresponding part of the string is displayed.
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Link: https://lkml.kernel.org/r/1618237865-33448-18-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:57 +08:00
|
|
|
ssize_t events_hybrid_sysfs_show(struct device *dev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *page)
|
|
|
|
{
|
|
|
|
struct perf_pmu_events_hybrid_attr *pmu_attr =
|
|
|
|
container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
|
|
|
|
struct x86_hybrid_pmu *pmu;
|
|
|
|
const char *str, *next_str;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (hweight64(pmu_attr->pmu_type) == 1)
|
|
|
|
return sprintf(page, "%s", pmu_attr->event_str);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hybrid PMUs may support the same event name, but with different
|
|
|
|
* event encoding, e.g., the mem-loads event on an Atom PMU has
|
|
|
|
* different event encoding from a Core PMU.
|
|
|
|
*
|
|
|
|
* The event_str includes all event encodings. Each event encoding
|
|
|
|
* is divided by ";". The order of the event encodings must follow
|
|
|
|
* the order of the hybrid PMU index.
|
|
|
|
*/
|
|
|
|
pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
|
|
|
|
|
|
|
|
str = pmu_attr->event_str;
|
|
|
|
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
|
|
|
|
if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
|
|
|
|
continue;
|
|
|
|
if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
|
|
|
|
next_str = strchr(str, ';');
|
|
|
|
if (next_str)
|
|
|
|
return snprintf(page, next_str - str + 1, "%s", str);
|
|
|
|
else
|
|
|
|
return sprintf(page, "%s", str);
|
|
|
|
}
|
|
|
|
str = strchr(str, ';');
|
|
|
|
str++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
|
|
|
|
|
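The same ';' convention can be illustrated from user space; the event string and encodings below are made up, only the splitting logic mirrors events_hybrid_sysfs_show():

#include <stdio.h>
#include <string.h>

/* Print the encoding for hybrid PMU index 'idx' from a shared event_str. */
static void print_hybrid_encoding(const char *event_str, int idx)
{
        const char *str = event_str;
        const char *next;
        int i;

        for (i = 0; i < idx; i++) {
                str = strchr(str, ';');
                if (!str)
                        return;         /* fewer segments than expected */
                str++;
        }
        next = strchr(str, ';');
        if (next)
                printf("%.*s\n", (int)(next - str), str);
        else
                printf("%s\n", str);
}

int main(void)
{
        /* hypothetical: big-core encoding first, then small-core encoding */
        const char *event_str = "event=0xd0,umask=0x05;event=0xcd,umask=0x01";

        print_hybrid_encoding(event_str, 0);    /* prints event=0xd0,umask=0x05 */
        print_hybrid_encoding(event_str, 1);    /* prints event=0xcd,umask=0x01 */
        return 0;
}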
2012-10-10 20:53:11 +08:00
|
|
|
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
|
|
|
|
EVENT_ATTR(instructions, INSTRUCTIONS );
|
|
|
|
EVENT_ATTR(cache-references, CACHE_REFERENCES );
|
|
|
|
EVENT_ATTR(cache-misses, CACHE_MISSES );
|
|
|
|
EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
|
|
|
|
EVENT_ATTR(branch-misses, BRANCH_MISSES );
|
|
|
|
EVENT_ATTR(bus-cycles, BUS_CYCLES );
|
|
|
|
EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
|
|
|
|
EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
|
|
|
|
EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
|
|
|
|
|
|
|
|
static struct attribute *empty_attrs;
|
|
|
|
|
2012-10-30 04:48:17 +08:00
|
|
|
static struct attribute *events_attr[] = {
|
2012-10-10 20:53:11 +08:00
|
|
|
EVENT_PTR(CPU_CYCLES),
|
|
|
|
EVENT_PTR(INSTRUCTIONS),
|
|
|
|
EVENT_PTR(CACHE_REFERENCES),
|
|
|
|
EVENT_PTR(CACHE_MISSES),
|
|
|
|
EVENT_PTR(BRANCH_INSTRUCTIONS),
|
|
|
|
EVENT_PTR(BRANCH_MISSES),
|
|
|
|
EVENT_PTR(BUS_CYCLES),
|
|
|
|
EVENT_PTR(STALLED_CYCLES_FRONTEND),
|
|
|
|
EVENT_PTR(STALLED_CYCLES_BACKEND),
|
|
|
|
EVENT_PTR(REF_CPU_CYCLES),
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2019-05-12 23:55:14 +08:00
|
|
|
/*
|
|
|
|
* Remove all undefined events (x86_pmu.event_map(id) == 0)
|
|
|
|
* out of events_attr attributes.
|
|
|
|
*/
|
|
|
|
static umode_t
|
|
|
|
is_visible(struct kobject *kobj, struct attribute *attr, int idx)
|
|
|
|
{
|
|
|
|
struct perf_pmu_events_attr *pmu_attr;
|
|
|
|
|
2019-12-06 19:50:16 +08:00
|
|
|
if (idx >= x86_pmu.max_events)
|
|
|
|
return 0;
|
|
|
|
|
2019-05-12 23:55:14 +08:00
|
|
|
pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
|
|
|
|
/* str trumps id */
|
|
|
|
return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
|
|
|
|
}
|
|
|
|
|
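The visible effect of is_visible() is simply that undefined events never appear as files; a quick user-space check (the path and event name are illustrative):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* On PMUs where the event_map() entry is 0 and no event_str is set,
         * the attribute is hidden, so the file simply does not exist. */
        const char *p = "/sys/bus/event_source/devices/cpu/events/stalled-cycles-frontend";

        printf("%s: %s\n", p, access(p, R_OK) == 0 ? "exposed" : "not exposed");
        return 0;
}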
2018-08-10 23:43:14 +08:00
|
|
|
static struct attribute_group x86_pmu_events_group __ro_after_init = {
|
2012-10-10 20:53:11 +08:00
|
|
|
.name = "events",
|
|
|
|
.attrs = events_attr,
|
2019-05-12 23:55:14 +08:00
|
|
|
.is_visible = is_visible,
|
2012-10-10 20:53:11 +08:00
|
|
|
};
|
|
|
|
|
2012-10-10 20:53:14 +08:00
|
|
|
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
|
2012-10-10 20:53:13 +08:00
|
|
|
{
|
|
|
|
u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
|
|
|
|
u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
|
|
|
|
bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
|
|
|
|
bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
|
|
|
|
bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
|
|
|
|
bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have a whole page to spend and just a little data
|
|
|
|
* to write, so we can safely use sprintf.
|
|
|
|
*/
|
|
|
|
ret = sprintf(page, "event=0x%02llx", event);
|
|
|
|
|
|
|
|
if (umask)
|
|
|
|
ret += sprintf(page + ret, ",umask=0x%02llx", umask);
|
|
|
|
|
|
|
|
if (edge)
|
|
|
|
ret += sprintf(page + ret, ",edge");
|
|
|
|
|
|
|
|
if (pc)
|
|
|
|
ret += sprintf(page + ret, ",pc");
|
|
|
|
|
|
|
|
if (any)
|
|
|
|
ret += sprintf(page + ret, ",any");
|
|
|
|
|
|
|
|
if (inv)
|
|
|
|
ret += sprintf(page + ret, ",inv");
|
|
|
|
|
|
|
|
if (cmask)
|
|
|
|
ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
|
|
|
|
|
|
|
|
ret += sprintf(page + ret, "\n");
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
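The field layout used above (event select in bits 0-7, umask in 8-15, cmask in 24-31) can be mirrored in a few lines of user-space C; the raw config value is made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t config = 0x020001c2;   /* illustrative raw config */
        uint64_t event  =  config        & 0xff;   /* EVENTSEL[7:0]  */
        uint64_t umask  = (config >> 8)  & 0xff;   /* UMASK[15:8]    */
        uint64_t cmask  = (config >> 24) & 0xff;   /* CMASK[31:24]   */

        printf("event=0x%02llx", (unsigned long long)event);
        if (umask)
                printf(",umask=0x%02llx", (unsigned long long)umask);
        if (cmask)
                printf(",cmask=0x%02llx", (unsigned long long)cmask);
        printf("\n");                   /* prints event=0xc2,umask=0x01,cmask=0x02 */
        return 0;
}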
2017-05-12 22:51:13 +08:00
|
|
|
static struct attribute_group x86_pmu_attr_group;
|
2017-08-28 18:46:50 +08:00
|
|
|
static struct attribute_group x86_pmu_caps_group;
|
2017-05-12 22:51:13 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static void x86_pmu_static_call_update(void)
|
|
|
|
{
|
|
|
|
static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
|
|
|
|
static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
|
|
|
|
static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
|
|
|
|
static_call_update(x86_pmu_enable, x86_pmu.enable);
|
|
|
|
static_call_update(x86_pmu_disable, x86_pmu.disable);
|
|
|
|
|
2021-09-08 00:39:01 +08:00
|
|
|
static_call_update(x86_pmu_assign, x86_pmu.assign);
|
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_update(x86_pmu_add, x86_pmu.add);
|
|
|
|
static_call_update(x86_pmu_del, x86_pmu.del);
|
|
|
|
static_call_update(x86_pmu_read, x86_pmu.read);
|
|
|
|
|
2022-05-11 03:27:22 +08:00
|
|
|
static_call_update(x86_pmu_set_period, x86_pmu.set_period);
|
|
|
|
static_call_update(x86_pmu_update, x86_pmu.update);
|
2022-05-11 03:28:18 +08:00
|
|
|
static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
|
2022-05-11 03:27:22 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
|
|
|
|
static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
|
|
|
|
static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
|
|
|
|
|
|
|
|
static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
|
|
|
|
static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
|
|
|
|
static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
|
|
|
|
|
|
|
|
static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
|
|
|
|
static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
|
|
|
|
|
|
|
|
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
|
|
|
|
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
|
2021-01-25 20:14:58 +08:00
|
|
|
|
|
|
|
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
|
perf: Rewrite core context handling
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).
Notably:
- HW breakpoint PMU
- ARM big.little PMU / Intel ADL PMU
- Intel Branch Monitoring PMU
- AMD IBS PMU
- S390 cpum_cf PMU
- PowerPC trace_imc PMU
*Current design:*
Currently we have a per task and per cpu perf_event_contexts:
task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
^ | ^ | ^
`---------------------------------' | `--> pmu ---'
v ^
perf_event ------'
Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task-related perf_event_contexts have a
pointer back to that task.
Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.
The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.
Each perf_event is then associated with its PMU and one
perf_event_context.
*Proposed design:*
The new design proposed by this patch reduces to a single task context and
a single CPU context but adds some intermediate data-structures:
task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
^ | ^ ^
`---------------------------' | |
| | perf_cpu_pmu_context <--.
| `----. ^ |
| | | |
| v v |
| ,--> perf_event_pmu_context |
| | |
| | |
v v |
perf_event ---> pmu ----------------'
With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:
{cpu, pmu, cgroup, group_index}
Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.
Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.
Each perf_event is associated with its pmu, perf_event_context and
perf_event_pmu_context.
Further optimizations to current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.
Much thanks to Ravi for picking this up and pushing it towards
completion.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
2022-10-08 14:24:24 +08:00
|
|
|
static_call_update(x86_pmu_filter, x86_pmu.filter);
|
2020-08-18 21:57:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
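For readers unfamiliar with the mechanism, the calls above boil down to the following pattern; 'my_op' and the functions are hypothetical, only the static_call_update()/static_call() usage mirrors what this file does:

#include <linux/static_call.h>

static void my_op_noop(int cpu) { }             /* safe default */

DEFINE_STATIC_CALL(my_op, my_op_noop);

/* Boot-time: patch the call site once the vendor implementation is known. */
static void my_init(void (*vendor_op)(int))
{
        if (vendor_op)
                static_call_update(my_op, vendor_op);
}

/* Hot path: behaves like a direct call, no indirect branch. */
static void my_hot_path(int cpu)
{
        static_call(my_op)(cpu);
}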
static void _x86_pmu_read(struct perf_event *event)
|
|
|
|
{
|
2022-05-11 03:27:22 +08:00
|
|
|
static_call(x86_pmu_update)(event);
|
2020-08-18 21:57:53 +08:00
|
|
|
}
|
|
|
|
|
2021-04-12 22:30:55 +08:00
|
|
|
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
|
|
|
|
u64 intel_ctrl)
|
|
|
|
{
|
|
|
|
pr_info("... version: %d\n", x86_pmu.version);
|
|
|
|
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
|
|
|
|
pr_info("... generic registers: %d\n", num_counters);
|
|
|
|
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
|
|
|
|
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
|
|
|
|
pr_info("... fixed-purpose events: %lu\n",
|
|
|
|
hweight64((((1ULL << num_counters_fixed) - 1)
|
|
|
|
<< INTEL_PMC_IDX_FIXED) & intel_ctrl));
|
|
|
|
pr_info("... event mask: %016Lx\n", intel_ctrl);
|
|
|
|
}
|
|
|
|
|
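As a worked example of the fixed-purpose line above (numbers are illustrative): with num_counters_fixed = 3 the mask is 0x7ULL shifted up to the fixed-counter index, and the population count of its overlap with intel_ctrl is the number of fixed counters actually enabled:

#include <stdio.h>
#include <stdint.h>

#define INTEL_PMC_IDX_FIXED 32          /* fixed counters start at bit 32 */

int main(void)
{
        int num_counters_fixed = 3;                     /* illustrative */
        uint64_t intel_ctrl = (0x7ULL << 32) | 0xf;     /* illustrative */
        uint64_t fixed_mask = ((1ULL << num_counters_fixed) - 1)
                              << INTEL_PMC_IDX_FIXED;

        printf("fixed-purpose events: %d\n",
               __builtin_popcountll(fixed_mask & intel_ctrl));  /* prints 3 */
        return 0;
}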
2011-01-22 07:30:01 +08:00
|
|
|
static int __init init_hw_perf_events(void)
|
2009-02-27 20:39:09 +08:00
|
|
|
{
|
2011-12-06 21:07:15 +08:00
|
|
|
struct x86_pmu_quirk *quirk;
|
2009-04-29 18:47:10 +08:00
|
|
|
int err;
|
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
pr_info("Performance Events: ");
|
2009-05-29 17:25:09 +08:00
|
|
|
|
2009-02-27 20:39:09 +08:00
|
|
|
switch (boot_cpu_data.x86_vendor) {
|
|
|
|
case X86_VENDOR_INTEL:
|
2009-04-29 18:47:10 +08:00
|
|
|
err = intel_pmu_init();
|
2009-02-27 20:39:09 +08:00
|
|
|
break;
|
x86: AMD Support for perf_counter
Supported basic performance counter for AMD K7 and later:
$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null
Performance counter stats for 'ls':
12.298610 task clock ticks (msecs)
3298477 CPU cycles (events)
1406354 instructions (events)
749035 cache references (events)
16939 cache misses (events)
100589 branches (events)
11159 branch misses (events)
7.627540 cpu clock ticks (msecs)
12.298610 task clock ticks (msecs)
500 pagefaults (events)
6 context switches (events)
3 CPU migrations (events)
Wall-clock time elapsed: 8.672290 msecs
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-02-27 22:45:14 +08:00
|
|
|
case X86_VENDOR_AMD:
|
2009-04-29 18:47:10 +08:00
|
|
|
err = amd_pmu_init();
|
x86: AMD Support for perf_counter
2009-02-27 22:45:14 +08:00
|
|
|
break;
|
2018-09-23 17:34:47 +08:00
|
|
|
case X86_VENDOR_HYGON:
|
|
|
|
err = amd_pmu_init();
|
|
|
|
x86_pmu.name = "HYGON";
|
|
|
|
break;
|
2020-04-13 11:14:29 +08:00
|
|
|
case X86_VENDOR_ZHAOXIN:
|
|
|
|
case X86_VENDOR_CENTAUR:
|
|
|
|
err = zhaoxin_pmu_init();
|
|
|
|
break;
|
2009-04-29 18:47:00 +08:00
|
|
|
default:
|
2013-09-28 21:48:48 +08:00
|
|
|
err = -ENOTSUPP;
|
2009-02-27 20:39:09 +08:00
|
|
|
}
|
2009-05-29 17:25:09 +08:00
|
|
|
if (err != 0) {
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
pr_cont("no PMU driver, software events only.\n");
|
2022-06-01 17:45:17 +08:00
|
|
|
err = 0;
|
|
|
|
goto out_bad_pmu;
|
2009-05-29 17:25:09 +08:00
|
|
|
}
|
2009-02-27 20:39:09 +08:00
|
|
|
|
2009-12-11 00:56:34 +08:00
|
|
|
pmu_check_apic();
|
|
|
|
|
2010-11-23 05:55:23 +08:00
|
|
|
/* sanity check that the hardware exists or is emulated */
|
2021-04-12 22:30:45 +08:00
|
|
|
if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
|
2022-06-01 17:45:17 +08:00
|
|
|
goto out_bad_pmu;
|
2010-11-23 05:55:23 +08:00
|
|
|
|
2009-05-29 17:25:09 +08:00
|
|
|
pr_cont("%s PMU driver.\n", x86_pmu.name);
|
2009-04-29 18:47:13 +08:00
|
|
|
|
2014-02-06 03:48:51 +08:00
|
|
|
x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
|
|
|
|
|
2011-12-06 21:07:15 +08:00
|
|
|
for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
|
|
|
|
quirk->func();
|
2010-03-05 04:49:01 +08:00
|
|
|
|
2012-06-21 02:46:34 +08:00
|
|
|
if (!x86_pmu.intel_ctrl)
|
|
|
|
x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
|
2008-12-03 17:39:53 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
perf_events_lapic_init();
|
2011-10-01 03:06:21 +08:00
|
|
|
register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
|
2009-05-29 17:25:09 +08:00
|
|
|
|
2010-01-22 23:32:17 +08:00
|
|
|
unconstrained = (struct event_constraint)
|
2010-03-30 00:36:50 +08:00
|
|
|
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
|
2013-01-24 23:10:27 +08:00
|
|
|
0, x86_pmu.num_counters, 0, 0);
|
2010-01-22 23:32:17 +08:00
|
|
|
|
2012-03-16 03:09:14 +08:00
|
|
|
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
|
2011-11-21 06:30:47 +08:00
|
|
|
|
2012-10-10 20:53:11 +08:00
|
|
|
if (!x86_pmu.events_sysfs_show)
|
|
|
|
x86_pmu_events_group.attrs = &empty_attrs;
|
2013-01-24 23:10:25 +08:00
|
|
|
|
2019-05-12 23:55:13 +08:00
|
|
|
pmu.attr_update = x86_pmu.attr_update;
|
2017-05-12 22:51:13 +08:00
|
|
|
|
perf/x86: Register hybrid PMUs
Different hybrid PMUs have different PMU capabilities and events. Perf
should register a dedicated PMU for each of them.
To check the X86 event, perf has to go through all possible hybrid pmus.
All the hybrid PMUs are registered at boot time. Before the
registration, add intel_pmu_check_hybrid_pmus() to check and update the
counters information, the event constraints, the extra registers and the
unique capabilities for each hybrid PMU.
Postpone the display of the PMU information and HW check to
CPU_STARTING, because the boot CPU is the only online CPU in the
init_hw_perf_events(). Perf doesn't know the availability of the other
PMUs. Perf should display the PMU information only if the counters of
the PMU are available.
All CPUs of one type may be offline. In this case, users can still
observe the PMU in /sys/devices, but its CPU mask is 0.
All hybrid PMUs have capability PERF_PMU_CAP_HETEROGENEOUS_CPUS.
The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned
later in a separate patch.
The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other
hybrid PMUs, the PMU type id is not hard-coded.
The event->cpu must be compatible with the supported CPUs of the PMU.
Add a check in the x86_pmu_event_init().
The events in a group must be from the same type of hybrid PMU.
The fake cpuc used in the validation must be from the supported CPU of
the event->pmu.
Perf may not retrieve a valid core type from get_this_hybrid_cpu_type().
For example, ADL may have an alternative configuration. With that
configuration, Perf cannot retrieve the core type from the CPUID leaf
0x1a. Add a platform specific get_hybrid_cpu_type(). If the generic way
fails, invoke the platform specific get_hybrid_cpu_type().
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-17-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:30:56 +08:00
|
|
|
if (!is_hybrid()) {
|
|
|
|
x86_pmu_show_pmu_cap(x86_pmu.num_counters,
|
|
|
|
x86_pmu.num_counters_fixed,
|
|
|
|
x86_pmu.intel_ctrl);
|
|
|
|
}
|
2010-03-05 20:01:18 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
if (!x86_pmu.read)
|
|
|
|
x86_pmu.read = _x86_pmu_read;
|
|
|
|
|
2021-01-25 20:14:58 +08:00
|
|
|
if (!x86_pmu.guest_get_msrs)
|
2021-03-10 01:10:19 +08:00
|
|
|
x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
|
2021-01-25 20:14:58 +08:00
|
|
|
|
2022-05-11 03:27:22 +08:00
|
|
|
if (!x86_pmu.set_period)
|
|
|
|
x86_pmu.set_period = x86_perf_event_set_period;
|
|
|
|
|
|
|
|
if (!x86_pmu.update)
|
|
|
|
x86_pmu.update = x86_perf_event_update;
|
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
x86_pmu_static_call_update();
|
|
|
|
|
2016-07-14 01:16:10 +08:00
|
|
|
/*
|
|
|
|
* Install callbacks. Core will call them for each online
|
|
|
|
* cpu.
|
|
|
|
*/
|
2016-12-22 03:19:54 +08:00
|
|
|
err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
|
2016-07-14 01:16:10 +08:00
|
|
|
x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
|
2016-12-22 03:19:54 +08:00
|
|
|
"perf/x86:starting", x86_pmu_starting_cpu,
|
2016-07-14 01:16:10 +08:00
|
|
|
x86_pmu_dying_cpu);
|
|
|
|
if (err)
|
|
|
|
goto out;
|
|
|
|
|
2016-12-22 03:19:54 +08:00
|
|
|
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
|
2016-07-14 01:16:10 +08:00
|
|
|
x86_pmu_online_cpu, NULL);
|
|
|
|
if (err)
|
|
|
|
goto out1;
|
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
if (!is_hybrid()) {
|
|
|
|
err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
|
|
|
|
if (err)
|
|
|
|
goto out2;
|
|
|
|
} else {
|
|
|
|
struct x86_hybrid_pmu *hybrid_pmu;
|
|
|
|
int i, j;
|
|
|
|
|
|
|
|
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
|
|
|
|
hybrid_pmu = &x86_pmu.hybrid_pmu[i];
|
|
|
|
|
|
|
|
hybrid_pmu->pmu = pmu;
|
|
|
|
hybrid_pmu->pmu.type = -1;
|
|
|
|
hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
|
|
|
|
hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
|
perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.
Extend the two types to become PMU aware types. The PMU type ID is
stored at attr.config[63:32].
Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a
PMU which supports the extended PERF_TYPE_HARDWARE and
PERF_TYPE_HW_CACHE.
The PMU type is only required when searching a specific PMU. The PMU
specific code will only be interested in the 'real' config value, which
is stored in the low 32 bits of the event->attr.config. Update the
event->attr.config in the generic code, so the PMU specific code doesn't
need to calculate it separately.
If a user specifies a PMU type, but the PMU doesn't support the extended
type, error out.
If an event cannot be initialized in a PMU specified by a user, error
out immediately. Perf should not try to open it on other PMUs.
The new PMU capability is only set for the X86 hybrid PMUs for now.
Other architectures, e.g., ARM, may need it as well. The support on ARM
may be implemented later separately.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.liang@linux.intel.com
2021-04-12 22:31:01 +08:00
|
|
|
hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
|
|
|
|
err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
|
|
|
|
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
|
|
|
|
if (err)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (i < x86_pmu.num_hybrid_pmus) {
|
|
|
|
for (j = 0; j < i; j++)
|
|
|
|
perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
|
|
|
|
pr_warn("Failed to register hybrid PMUs\n");
|
|
|
|
kfree(x86_pmu.hybrid_pmu);
|
|
|
|
x86_pmu.hybrid_pmu = NULL;
|
|
|
|
x86_pmu.num_hybrid_pmus = 0;
|
|
|
|
goto out2;
|
|
|
|
}
|
|
|
|
}
|
2010-11-26 01:38:29 +08:00
|
|
|
|
|
|
|
return 0;
|
2016-07-14 01:16:10 +08:00
|
|
|
|
|
|
|
out2:
|
|
|
|
cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
|
|
|
|
out1:
|
|
|
|
cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
|
|
|
|
out:
|
|
|
|
cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
|
2022-06-01 17:45:17 +08:00
|
|
|
out_bad_pmu:
|
|
|
|
memset(&x86_pmu, 0, sizeof(x86_pmu));
|
2016-07-14 01:16:10 +08:00
|
|
|
return err;
|
2008-12-03 17:39:53 +08:00
|
|
|
}
|
2010-11-26 01:38:29 +08:00
|
|
|
early_initcall(init_hw_perf_events);
|
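The PERF_PMU_CAP_EXTENDED_HW_TYPE capability set during the hybrid registration above is what lets user space aim a generic hardware event at a specific hybrid PMU by putting that PMU's type id into attr.config[63:32], as described in the commit message earlier. A minimal, illustrative user-space sketch (the sysfs PMU name is machine-specific):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                           int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long pmu_type = 0;
        FILE *f;
        int fd;

        /* numeric id of the target hybrid PMU; "cpu_core" is illustrative */
        f = fopen("/sys/bus/event_source/devices/cpu_core/type", "r");
        if (!f)
                return 1;
        if (fscanf(f, "%llu", &pmu_type) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        /* PMU type in config[63:32], generic event id in config[31:0] */
        attr.config = (pmu_type << 32) | PERF_COUNT_HW_CPU_CYCLES;

        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0)
                perror("perf_event_open");
        else
                close(fd);
        return 0;
}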
2008-12-11 19:46:46 +08:00
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
static void x86_pmu_read(struct perf_event *event)
|
2008-12-13 16:00:03 +08:00
|
|
|
{
|
2020-08-18 21:57:53 +08:00
|
|
|
static_call(x86_pmu_read)(event);
|
2008-12-13 16:00:03 +08:00
|
|
|
}
|
|
|
|
|
2010-04-23 13:56:12 +08:00
|
|
|
/*
|
|
|
|
* Start group events scheduling transaction
|
|
|
|
* Set the flag to make pmu::enable() not perform the
|
|
|
|
* schedulability test, it will be performed at commit time
|
2015-09-04 11:07:45 +08:00
|
|
|
*
|
|
|
|
* We only support PERF_PMU_TXN_ADD transactions. Save the
|
|
|
|
* transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD
|
|
|
|
* transactions.
|
2010-04-23 13:56:12 +08:00
|
|
|
*/
|
2015-09-04 11:07:45 +08:00
|
|
|
static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
|
2010-04-23 13:56:12 +08:00
|
|
|
{
|
2015-09-04 11:07:45 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
|
|
|
|
|
|
|
|
cpuc->txn_flags = txn_flags;
|
|
|
|
if (txn_flags & ~PERF_PMU_TXN_ADD)
|
|
|
|
return;
|
|
|
|
|
2010-06-14 14:49:00 +08:00
|
|
|
perf_pmu_disable(pmu);
|
2010-12-18 23:28:55 +08:00
|
|
|
__this_cpu_write(cpu_hw_events.n_txn, 0);
|
2020-10-05 16:09:06 +08:00
|
|
|
__this_cpu_write(cpu_hw_events.n_txn_pair, 0);
|
perf/x86: Fix n_metric for cancelled txn
When a group that has TopDown members fails to be scheduled, any
later TopDown groups will not return valid values.
Here is an example.
A background perf that occupies all the GP counters and the fixed
counter 1.
$perf stat -e "{cycles,cycles,cycles,cycles,cycles,cycles,cycles,
cycles,cycles}:D" -a
A user monitors a TopDown group. It works well, because the fixed
counter 3 and the PERF_METRICS are available.
$perf stat -x, --topdown -- ./workload
retiring,bad speculation,frontend bound,backend bound,
18.0,16.1,40.4,25.5,
Then the user tries to monitor a group that has TopDown members.
Because of the cycles event, the group fails to be scheduled.
$perf stat -x, -e '{slots,topdown-retiring,topdown-be-bound,
topdown-fe-bound,topdown-bad-spec,cycles}'
-- ./workload
<not counted>,,slots,0,0.00,,
<not counted>,,topdown-retiring,0,0.00,,
<not counted>,,topdown-be-bound,0,0.00,,
<not counted>,,topdown-fe-bound,0,0.00,,
<not counted>,,topdown-bad-spec,0,0.00,,
<not counted>,,cycles,0,0.00,,
The user tries to monitor a TopDown group again. It doesn't work anymore.
$perf stat -x, --topdown -- ./workload
,,,,,
In a txn, cancel_txn() is to truncate the event_list for a canceled
group and update the number of events added in this transaction.
However, the number of TopDown events added in this transaction is not
updated. The kernel will probably fail to add new Topdown events.
Fixes: 7b2c05a15d29 ("perf/x86/intel: Generic support for hardware TopDown metrics")
Reported-by: Andi Kleen <ak@linux.intel.com>
Reported-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Kan Liang <kan.liang@linux.intel.com>
Link: https://lkml.kernel.org/r/20201005082611.GH2628@hirez.programming.kicks-ass.net
2020-10-05 16:10:24 +08:00
|
|
|
__this_cpu_write(cpu_hw_events.n_txn_metric, 0);
|
2010-04-23 13:56:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stop group events scheduling transaction
|
|
|
|
* Clear the flag and pmu::enable() will perform the
|
|
|
|
* schedulability test.
|
|
|
|
*/
|
2010-06-11 19:35:57 +08:00
|
|
|
static void x86_pmu_cancel_txn(struct pmu *pmu)
|
2010-04-23 13:56:12 +08:00
|
|
|
{
|
2015-09-04 11:07:45 +08:00
|
|
|
unsigned int txn_flags;
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
|
|
|
|
|
|
|
|
txn_flags = cpuc->txn_flags;
|
|
|
|
cpuc->txn_flags = 0;
|
|
|
|
if (txn_flags & ~PERF_PMU_TXN_ADD)
|
|
|
|
return;
|
|
|
|
|
perf_events: Fix event scheduling issues introduced by transactional API
The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog, and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often than on AMD. But the problem also existed on AMD, though
harder to expose.
The issues were:
- group_sched_in() was missing a cancel_txn() in the error path
- cpuc->n_added was not properly maintained, leading to missing
actions in hw_perf_enable(), i.e., n_running being 0. You cannot
update n_added until you know the transaction has succeeded. In
case of failed transaction n_added was not adjusted back.
- in case of failed transactions, event_sched_out() was called
and eventually invoked x86_disable_event() to touch the HW reg.
But with transactions, on X86, event_sched_in() does not touch
HW registers, it simply collects events into a list. Thus, you
could end up calling x86_disable_event() on a counter which
did not correspond to the current event when idx != -1.
The patch modifies the generic and X86 code to avoid all those problems.
First, we keep track of the number of events added last. In case the
transaction fails, we subtract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.
Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.
With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-25 22:23:10 +08:00
|
|
|
/*
|
2014-02-24 19:26:21 +08:00
|
|
|
* Truncate collected array by the number of events added in this
|
|
|
|
* transaction. See x86_pmu_add() and x86_pmu_*_txn().
|
perf_events: Fix event scheduling issues introduced by transactional API
2010-05-25 22:23:10 +08:00
|
|
|
*/
|
2010-12-18 23:28:55 +08:00
|
|
|
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
|
|
|
|
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
|
2020-10-05 16:09:06 +08:00
|
|
|
__this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
|
perf/x86: Fix n_metric for cancelled txn
2020-10-05 16:10:24 +08:00
|
|
|
__this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
|
2010-06-14 14:49:00 +08:00
|
|
|
perf_pmu_enable(pmu);
|
2010-04-23 13:56:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Commit group events scheduling transaction
|
|
|
|
* Perform the group schedulability test as a whole
|
|
|
|
* Return 0 on success
|
2014-02-24 19:26:21 +08:00
|
|
|
*
|
|
|
|
* Does not cancel the transaction on failure; expects the caller to do this.
|
2010-04-23 13:56:12 +08:00
|
|
|
*/
|
2010-06-11 19:35:57 +08:00
|
|
|
static int x86_pmu_commit_txn(struct pmu *pmu)
|
2010-04-23 13:56:12 +08:00
|
|
|
{
|
x86: Replace __get_cpu_var uses
__get_cpu_var() is used for multiple purposes in the kernel source. One of
them is address calculation via the form &__get_cpu_var(x). This calculates
the address for the instance of the percpu variable of the current processor
based on an offset.
Other use cases are for storing and retrieving data from the current
processors percpu area. __get_cpu_var() can be used as an lvalue when
writing data or on the right side of an assignment.
__get_cpu_var() is defined as :
#define __get_cpu_var(var) (*this_cpu_ptr(&(var)))
__get_cpu_var() always only does an address determination. However, store
and retrieve operations could use a segment prefix (or global register on
other platforms) to avoid the address calculation.
this_cpu_write() and this_cpu_read() can directly take an offset into a
percpu area and use optimized assembly code to read and write per cpu
variables.
This patch converts __get_cpu_var into either an explicit address
calculation using this_cpu_ptr() or into a use of this_cpu operations that
use the offset. Thereby address calculations are avoided and less registers
are used when code is generated.
Transformations done to __get_cpu_var()
1. Determine the address of the percpu instance of the current processor.
DEFINE_PER_CPU(int, y);
int *x = &__get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(&y);
2. Same as #1 but this time an array structure is involved.
DEFINE_PER_CPU(int, y[20]);
int *x = __get_cpu_var(y);
Converts to
int *x = this_cpu_ptr(y);
3. Retrieve the content of the current processors instance of a per cpu
variable.
DEFINE_PER_CPU(int, y);
int x = __get_cpu_var(y)
Converts to
int x = __this_cpu_read(y);
4. Retrieve the content of a percpu struct
DEFINE_PER_CPU(struct mystruct, y);
struct mystruct x = __get_cpu_var(y);
Converts to
memcpy(&x, this_cpu_ptr(&y), sizeof(x));
5. Assignment to a per cpu variable
DEFINE_PER_CPU(int, y)
__get_cpu_var(y) = x;
Converts to
__this_cpu_write(y, x);
6. Increment/Decrement etc of a per cpu variable
DEFINE_PER_CPU(int, y);
__get_cpu_var(y)++
Converts to
__this_cpu_inc(y)
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: x86@kernel.org
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2014-08-18 01:30:40 +08:00
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
2010-04-23 13:56:12 +08:00
|
|
|
int assign[X86_PMC_IDX_MAX];
|
|
|
|
int n, ret;
|
|
|
|
|
2015-09-04 11:07:45 +08:00
|
|
|
WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
|
|
|
|
|
|
|
|
if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
|
|
|
|
cpuc->txn_flags = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2010-04-23 13:56:12 +08:00
|
|
|
n = cpuc->n_events;
|
|
|
|
|
|
|
|
if (!x86_pmu_initialized())
|
|
|
|
return -EAGAIN;
|
|
|
|
|
2020-08-18 21:57:53 +08:00
|
|
|
ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
|
2010-04-23 13:56:12 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* copy new assignment, now we know it is possible
|
|
|
|
* will be used by hw_perf_enable()
|
|
|
|
*/
|
|
|
|
memcpy(cpuc->assign, assign, n*sizeof(int));
|
|
|
|
|
2015-09-04 11:07:45 +08:00
|
|
|
cpuc->txn_flags = 0;
|
2010-06-14 14:49:00 +08:00
|
|
|
perf_pmu_enable(pmu);
|
2010-04-23 13:56:12 +08:00
|
|
|
return 0;
|
|
|
|
}
|
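Pulled together, start_txn/commit_txn/cancel_txn implement a simple protocol for the generic group-scheduling code. A condensed sketch of the caller side (the real logic lives in group_sched_in() in kernel/events/core.c; leader handling and most error paths are omitted, so treat this as an outline rather than the exact call sequence):

static int sketch_group_sched_in(struct pmu *pmu, struct perf_event *leader)
{
        struct perf_event *event;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);  /* collect only, defer the HW test */

        for_each_sibling_event(event, leader) {
                if (event->pmu->add(event, PERF_EF_START))
                        goto error;
        }

        if (!pmu->commit_txn(pmu))              /* whole-group schedulability test */
                return 0;
error:
        pmu->cancel_txn(pmu);                   /* rolls back n_added/n_events */
        return -EAGAIN;
}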
2011-06-06 22:57:08 +08:00
|
|
|
/*
|
|
|
|
* a fake_cpuc is used to validate event groups. Due to
|
|
|
|
* the extra reg logic, we need to also allocate a fake
|
|
|
|
* per_core and per_cpu structure. Otherwise, group events
|
|
|
|
* using extra reg may conflict without the kernel being
|
|
|
|
* able to catch this when the last event gets added to
|
|
|
|
* the group.
|
|
|
|
*/
|
|
|
|
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
|
|
|
|
{
|
2019-03-06 05:23:15 +08:00
|
|
|
intel_cpuc_finish(cpuc);
|
2011-06-06 22:57:08 +08:00
|
|
|
kfree(cpuc);
|
|
|
|
}
|
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
|
2011-06-06 22:57:08 +08:00
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc;
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
int cpu;
|
2011-06-06 22:57:08 +08:00
|
|
|
|
|
|
|
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
|
|
|
|
if (!cpuc)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
2012-06-05 21:30:31 +08:00
|
|
|
cpuc->is_fake = 1;
|
2019-03-06 05:23:15 +08:00
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
if (is_hybrid()) {
|
|
|
|
struct x86_hybrid_pmu *h_pmu;
|
|
|
|
|
|
|
|
h_pmu = hybrid_pmu(event_pmu);
|
|
|
|
if (cpumask_empty(&h_pmu->supported_cpus))
|
|
|
|
goto error;
|
|
|
|
cpu = cpumask_first(&h_pmu->supported_cpus);
|
|
|
|
} else
|
|
|
|
cpu = raw_smp_processor_id();
|
|
|
|
cpuc->pmu = event_pmu;
|
|
|
|
|
2019-03-06 05:23:15 +08:00
|
|
|
if (intel_cpuc_prepare(cpuc, cpu))
|
|
|
|
goto error;
|
|
|
|
|
2011-06-06 22:57:08 +08:00
|
|
|
return cpuc;
|
|
|
|
error:
|
|
|
|
free_fake_cpuc(cpuc);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
2010-04-23 13:56:12 +08:00
|
|
|
|
perf, x86: Add PEBS infrastructure
This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap-like snapshot of a subset of the machine registers.
This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.
With the PEBS hardware assist being trap-like, the reported IP is always
one instruction after the actual instruction that triggered the event.
This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).
It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
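For reference, a hedged user-space sketch of requesting this precise mode via perf_event_open(); note that today's UAPI exposes the attribute as the two-bit precise_ip field rather than the original 'precise' flag, and the sample period below is an arbitrary example value:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.precise_ip = 1;	/* ask for the PEBS-assisted (constrained but precise) mode */
	attr.disabled = 1;

	/* count the calling thread on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}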
2010-03-03 02:52:12 +08:00
|
|
|
/*
|
|
|
|
* validate that we can schedule this event
|
|
|
|
*/
|
|
|
|
static int validate_event(struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *fake_cpuc;
|
|
|
|
struct event_constraint *c;
|
|
|
|
int ret = 0;
|
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
fake_cpuc = allocate_fake_cpuc(event->pmu);
|
2011-06-06 22:57:08 +08:00
|
|
|
if (IS_ERR(fake_cpuc))
|
|
|
|
return PTR_ERR(fake_cpuc);
|
perf, x86: Add PEBS infrastructure
2010-03-03 02:52:12 +08:00
|
|
|
|
2019-03-14 16:57:57 +08:00
|
|
|
c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
|
perf, x86: Add PEBS infrastructure
2010-03-03 02:52:12 +08:00
|
|
|
|
|
|
|
if (!c || !c->weight)
|
2011-11-10 00:56:37 +08:00
|
|
|
ret = -EINVAL;
|
perf, x86: Add PEBS infrastructure
2010-03-03 02:52:12 +08:00
|
|
|
|
|
|
|
if (x86_pmu.put_event_constraints)
|
|
|
|
x86_pmu.put_event_constraints(fake_cpuc, event);
|
|
|
|
|
2011-06-06 22:57:08 +08:00
|
|
|
free_fake_cpuc(fake_cpuc);
|
perf, x86: Add PEBS infrastructure
2010-03-03 02:52:12 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-01-18 16:58:01 +08:00
|
|
|
/*
|
|
|
|
* validate a single event group
|
|
|
|
*
|
|
|
|
* validation includes:
|
2010-01-27 15:39:39 +08:00
|
|
|
* - check events are compatible with each other
|
|
|
|
* - events do not compete for the same counter
|
|
|
|
* - number of events <= number of counters
|
2010-01-18 16:58:01 +08:00
|
|
|
*
|
|
|
|
* validation ensures the group can be loaded onto the
|
|
|
|
* PMU if it was the only group available.
|
|
|
|
*/
|
2009-10-08 17:56:07 +08:00
|
|
|
static int validate_group(struct perf_event *event)
|
|
|
|
{
|
2010-01-18 16:58:01 +08:00
|
|
|
struct perf_event *leader = event->group_leader;
|
2010-01-22 21:35:46 +08:00
|
|
|
struct cpu_hw_events *fake_cpuc;
|
2011-11-10 00:56:37 +08:00
|
|
|
int ret = -EINVAL, n;
|
2009-10-08 17:56:07 +08:00
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
/*
|
|
|
|
* Reject events from different hybrid PMUs.
|
|
|
|
*/
|
|
|
|
if (is_hybrid()) {
|
|
|
|
struct perf_event *sibling;
|
|
|
|
struct pmu *pmu = NULL;
|
|
|
|
|
|
|
|
if (is_x86_event(leader))
|
|
|
|
pmu = leader->pmu;
|
|
|
|
|
|
|
|
for_each_sibling_event(sibling, leader) {
|
|
|
|
if (!is_x86_event(sibling))
|
|
|
|
continue;
|
|
|
|
if (!pmu)
|
|
|
|
pmu = sibling->pmu;
|
|
|
|
else if (pmu != sibling->pmu)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fake_cpuc = allocate_fake_cpuc(event->pmu);
|
2011-06-06 22:57:08 +08:00
|
|
|
if (IS_ERR(fake_cpuc))
|
|
|
|
return PTR_ERR(fake_cpuc);
|
2010-01-18 16:58:01 +08:00
|
|
|
/*
|
|
|
|
* the event is not yet connected with its
|
|
|
|
* siblings therefore we must first collect
|
|
|
|
* existing siblings, then add the new event
|
|
|
|
* before we can simulate the scheduling
|
|
|
|
*/
|
2010-01-22 21:35:46 +08:00
|
|
|
n = collect_events(fake_cpuc, leader, true);
|
2010-01-18 16:58:01 +08:00
|
|
|
if (n < 0)
|
2011-06-06 22:57:08 +08:00
|
|
|
goto out;
|
2009-10-08 17:56:07 +08:00
|
|
|
|
2010-01-22 21:35:46 +08:00
|
|
|
fake_cpuc->n_events = n;
|
|
|
|
n = collect_events(fake_cpuc, event, false);
|
2010-01-18 16:58:01 +08:00
|
|
|
if (n < 0)
|
2011-06-06 22:57:08 +08:00
|
|
|
goto out;
|
2009-10-08 17:56:07 +08:00
|
|
|
|
perf/x86: Remove PERF_X86_EVENT_COMMITTED
The flag PERF_X86_EVENT_COMMITTED is used to find uncommitted events
for which to call put_event_constraint() when scheduling fails.
These are the newly added events to the list, and must form, by
definition, the tail of cpuc->event_list[]. By computing the list
index of the last successful schedule, iteration can then start there
and the flag is redundant.
There are only 3 callers of x86_schedule_events(), notably:
- x86_pmu_add()
- x86_pmu_commit_txn()
- validate_group()
For x86_pmu_add(), cpuc->n_events isn't updated until after
schedule_events() succeeds, therefore cpuc->n_events points to the
desired index.
For x86_pmu_commit_txn(), cpuc->n_events is updated, but we can
trivially compute the desired value with cpuc->n_txn -- the number of
events added in this transaction.
For validate_group(), we can make the rule for x86_pmu_add() work by
simply setting cpuc->n_events to 0 before calling schedule_events().
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2019-03-14 19:58:52 +08:00
|
|
|
fake_cpuc->n_events = 0;
|
2010-03-12 00:54:39 +08:00
|
|
|
ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
|
2010-01-22 21:35:46 +08:00
|
|
|
|
|
|
|
out:
|
2011-06-06 22:57:08 +08:00
|
|
|
free_fake_cpuc(fake_cpuc);
|
2010-01-22 21:35:46 +08:00
|
|
|
return ret;
|
2009-10-08 17:56:07 +08:00
|
|
|
}
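For context, validate_group() is exercised when user space builds an event group by passing the leader's fd as group_fd to perf_event_open(). A minimal sketch (an assumed user-space helper, not code from this file):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int open_counter(unsigned long long config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = config;

	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

int main(void)
{
	int leader, sibling;

	leader  = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1);
	/* adding a sibling makes the kernel re-validate the whole group */
	sibling = open_counter(PERF_COUNT_HW_INSTRUCTIONS, leader);

	if (sibling >= 0)
		close(sibling);
	if (leader >= 0)
		close(leader);
	return 0;
}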
|
|
|
|
|
2011-01-22 07:30:01 +08:00
|
|
|
static int x86_pmu_event_init(struct perf_event *event)
|
2008-12-11 19:46:46 +08:00
|
|
|
{
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
struct x86_hybrid_pmu *pmu = NULL;
|
2008-12-11 19:46:46 +08:00
|
|
|
int err;
|
|
|
|
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
if ((event->attr.type != event->pmu->type) &&
|
|
|
|
(event->attr.type != PERF_TYPE_HARDWARE) &&
|
|
|
|
(event->attr.type != PERF_TYPE_HW_CACHE))
|
2010-06-11 19:35:08 +08:00
|
|
|
return -ENOENT;
|
perf/x86: Register hybrid PMUs
2021-04-12 22:30:56 +08:00
|
|
|
|
|
|
|
if (is_hybrid() && (event->cpu != -1)) {
|
|
|
|
pmu = hybrid_pmu(event->pmu);
|
|
|
|
if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
|
|
|
|
return -ENOENT;
|
2010-06-11 19:35:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
err = __x86_pmu_event_init(event);
|
2009-10-08 17:56:07 +08:00
|
|
|
if (!err) {
|
|
|
|
if (event->group_leader != event)
|
|
|
|
err = validate_group(event);
|
perf, x86: Add PEBS infrastructure
2010-03-03 02:52:12 +08:00
|
|
|
else
|
|
|
|
err = validate_event(event);
|
2009-10-08 17:56:07 +08:00
|
|
|
}
|
2009-09-09 16:04:47 +08:00
|
|
|
if (err) {
|
perf: Do the big rename: Performance Counters -> Performance Events
2009-09-21 18:02:48 +08:00
|
|
|
if (event->destroy)
|
|
|
|
event->destroy(event);
|
2021-09-29 15:04:21 +08:00
|
|
|
event->destroy = NULL;
|
2009-09-09 16:04:47 +08:00
|
|
|
}
|
2008-12-11 19:46:46 +08:00
|
|
|
|
2018-02-13 06:20:35 +08:00
|
|
|
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
|
2018-03-12 22:45:37 +08:00
|
|
|
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
|
2021-12-09 04:11:20 +08:00
|
|
|
event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
|
2014-10-25 06:58:12 +08:00
|
|
|
|
2010-06-11 19:35:08 +08:00
|
|
|
return err;
|
2008-12-11 19:46:46 +08:00
|
|
|
}
|
2009-03-31 01:07:15 +08:00
|
|
|
|
perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
The counter value of a perf task may leak to another RDPMC task.
For example, a perf stat task as below is running on CPU 0.
perf stat -e 'branches,cycles' -- taskset -c 0 ./workload
In the meantime, an RDPMC task, which is also running on CPU 0, may read
the GP counters periodically. (The RDPMC task creates a fixed event,
but reads four GP counters.)
$./rdpmc_read_all_counters
index 0x0 value 0x8001e5970f99
index 0x1 value 0x8005d750edb6
index 0x2 value 0x0
index 0x3 value 0x0
index 0x0 value 0x8002358e48a5
index 0x1 value 0x8006bd1e3bc9
index 0x2 value 0x0
index 0x3 value 0x0
It is a potential security issue: once the attacker knows what the other
thread is counting, the PerfMon counter can be used as a side channel to
attack cryptosystems.
The counter value of the perf stat task leaks to the RDPMC task because
perf never clears the counter when it's stopped.
Three methods were considered to address the issue.
- Unconditionally reset the counter in x86_pmu_del(). It can bring extra
overhead even when there is no RDPMC task running.
- Only reset the un-assigned dirty counters when the RDPMC task is
scheduled in via sched_task(). It fails for the below case.
Thread A Thread B
clone(CLONE_THREAD) --->
set_affine(0)
set_affine(1)
while (!event-enabled)
;
event = perf_event_open()
mmap(event)
ioctl(event, IOC_ENABLE); --->
RDPMC
Counters are still leaked to the thread B.
- Only reset the un-assigned dirty counters before updating the CR4.PCE
bit. The method is implemented here.
A dirty counter is one whose assigned event has been
deleted but whose value has not been reset. To track the dirty counters,
add a 'dirty' variable in the struct cpu_hw_events.
The security issue can only be found with an RDPMC task. To enable the
RDPMC, the CR4.PCE bit has to be updated. Add a
perf_clear_dirty_counters() right before updating the CR4.PCE bit to
clear the existing dirty counters. Only the current un-assigned dirty
counters are reset, because the RDPMC assigned dirty counters will be
updated soon.
After applying the patch,
$ ./rdpmc_read_all_counters
index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0
index 0x0 value 0x0
index 0x1 value 0x0
index 0x2 value 0x0
index 0x3 value 0x0
Performance
The performance of a context switch is only impacted when there are two
or more perf users and one of them is an RDPMC user. In other
cases, there is no performance impact.
The worst-case occurs when there are two users: the RDPMC user only
uses one counter; while the other user uses all available counters.
When the RDPMC task is scheduled in, all the counters, other than the
RDPMC assigned one, have to be reset.
Test results for the worst-case, using a modified lat_ctx as measured
on an Ice Lake platform, which has 8 GP and 3 FP counters (ignoring
SLOTS).
lat_ctx -s 128K -N 1000 processes 2
Without the patch:
The context switch time is 4.97 us
With the patch:
The context switch time is 5.16 us
There is a ~4% performance drop for the context switch time in the
worst case.
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1623693582-187370-1-git-send-email-kan.liang@linux.intel.com
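A minimal sketch of what such an RDPMC reader might look like (illustrative only; the rdpmc_read_all_counters tool above is not reproduced here, and the fixed range of four counter indices is an assumption):

#include <stdint.h>
#include <stdio.h>

/* raw RDPMC: counter index in ECX, result in EDX:EAX */
static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	/* Assumes CR4.PCE permits user RDPMC (e.g. the rdpmc sysfs knob is 2,
	 * or the task has an mmap'ed perf event); otherwise this faults. */
	for (uint32_t i = 0; i < 4; i++)
		printf("index %#x value %#llx\n", i,
		       (unsigned long long)rdpmc(i));
	return 0;
}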
2021-06-15 01:59:42 +08:00
|
|
|
void perf_clear_dirty_counters(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Don't need to clear the assigned counter. */
|
|
|
|
for (i = 0; i < cpuc->n_events; i++)
|
|
|
|
__clear_bit(cpuc->assign[i], cpuc->dirty);
|
|
|
|
|
|
|
|
if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
|
|
|
|
return;
|
|
|
|
|
|
|
|
for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
|
2021-07-29 17:14:57 +08:00
|
|
|
if (i >= INTEL_PMC_IDX_FIXED) {
|
|
|
|
/* Metrics and fake events don't have corresponding HW counters. */
|
|
|
|
if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed))
|
|
|
|
continue;
|
|
|
|
|
perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
2021-06-15 01:59:42 +08:00
|
|
|
wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
|
2021-07-29 17:14:57 +08:00
|
|
|
} else {
|
perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
2021-06-15 01:59:42 +08:00
|
|
|
wrmsrl(x86_pmu_event_addr(i), 0);
|
2021-07-29 17:14:57 +08:00
|
|
|
}
|
perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
2021-06-15 01:59:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
|
|
|
|
}
|
|
|
|
|
2017-08-03 01:39:30 +08:00
|
|
|
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
|
2014-10-25 06:58:12 +08:00
|
|
|
{
|
2021-12-09 04:11:20 +08:00
|
|
|
if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
|
2014-10-25 06:58:12 +08:00
|
|
|
return;
|
|
|
|
|
2017-03-17 03:59:40 +08:00
|
|
|
/*
|
|
|
|
* This function relies on not being called concurrently in two
|
|
|
|
* tasks in the same mm. Otherwise one task could observe
|
|
|
|
* perf_rdpmc_allowed > 1 and return all the way back to
|
|
|
|
* userspace with CR4.PCE clear while another task is still
|
|
|
|
* doing on_each_cpu_mask() to propagate CR4.PCE.
|
|
|
|
*
|
2020-06-09 12:33:54 +08:00
|
|
|
* For now, this can't happen because all callers hold mmap_lock
|
2017-03-17 03:59:40 +08:00
|
|
|
* for write. If this changes, we'll need a different solution.
|
|
|
|
*/
|
2020-06-09 12:33:44 +08:00
|
|
|
mmap_assert_write_locked(mm);
|
2017-03-17 03:59:40 +08:00
|
|
|
|
2017-08-03 01:39:30 +08:00
|
|
|
if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
|
2020-04-21 17:20:30 +08:00
|
|
|
on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
|
2014-10-25 06:58:12 +08:00
|
|
|
}
|
|
|
|
|
2017-08-03 01:39:30 +08:00
|
|
|
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
|
2014-10-25 06:58:12 +08:00
|
|
|
{
|
2021-12-09 04:11:20 +08:00
|
|
|
if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
|
2014-10-25 06:58:12 +08:00
|
|
|
return;
|
|
|
|
|
2017-08-03 01:39:30 +08:00
|
|
|
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
|
2020-04-21 17:20:30 +08:00
|
|
|
on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
|
2014-10-25 06:58:12 +08:00
|
|
|
}
|
|
|
|
|
2011-11-21 03:44:06 +08:00
|
|
|
static int x86_pmu_event_idx(struct perf_event *event)
|
|
|
|
{
|
2020-07-24 01:11:04 +08:00
|
|
|
struct hw_perf_event *hwc = &event->hw;
|
2011-11-21 03:44:06 +08:00
|
|
|
|
2021-12-09 04:11:20 +08:00
|
|
|
if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
|
2012-03-23 00:26:36 +08:00
|
|
|
return 0;
|
|
|
|
|
2020-07-24 01:11:14 +08:00
|
|
|
if (is_metric_idx(hwc->idx))
|
|
|
|
return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
|
|
|
|
else
|
|
|
|
return hwc->event_base_rdpmc + 1;
|
2011-11-21 03:44:06 +08:00
|
|
|
}
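The +1 above follows the user-space convention for perf_event_mmap_page: an index of 0 means the counter cannot be read with RDPMC, and a non-zero index is used as rdpmc(index - 1). A hedged sketch of that self-monitoring read loop (field names are from the UAPI header; counter-width sign extension via pc->pmc_width is omitted for brevity):

#include <linux/perf_event.h>
#include <stdint.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return ((uint64_t)hi << 32) | lo;
}

/* pc points at the event's mmap'ed perf_event_mmap_page */
static uint64_t read_self(struct perf_event_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq, idx;

	do {
		seq = pc->lock;
		asm volatile("" ::: "memory");	/* compiler barrier */

		idx = pc->index;		/* value produced by event_idx() */
		count = pc->offset;
		if (pc->cap_user_rdpmc && idx)
			count += rdpmc(idx - 1);

		asm volatile("" ::: "memory");
	} while (pc->lock != seq);

	return count;
}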
|
|
|
|
|
2011-11-21 06:30:47 +08:00
|
|
|
static ssize_t get_attr_rdpmc(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static ssize_t set_attr_rdpmc(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
const char *buf, size_t count)
|
|
|
|
{
|
2012-06-11 11:13:41 +08:00
|
|
|
unsigned long val;
|
|
|
|
ssize_t ret;
|
|
|
|
|
|
|
|
ret = kstrtoul(buf, 0, &val);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2014-02-06 03:48:51 +08:00
|
|
|
|
2014-10-25 06:58:13 +08:00
|
|
|
if (val > 2)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2014-02-06 03:48:51 +08:00
|
|
|
if (x86_pmu.attr_rdpmc_broken)
|
|
|
|
return -ENOTSUPP;
|
2011-11-21 06:30:47 +08:00
|
|
|
|
2019-11-25 13:48:38 +08:00
|
|
|
if (val != x86_pmu.attr_rdpmc) {
|
2014-10-25 06:58:13 +08:00
|
|
|
/*
|
2019-11-25 13:48:38 +08:00
|
|
|
* Changing into or out of never available or always available,
|
|
|
|
* aka perf-event-bypassing mode. This path is extremely slow,
|
2014-10-25 06:58:13 +08:00
|
|
|
* but only root can trigger it, so it's okay.
|
|
|
|
*/
|
2019-11-25 13:48:38 +08:00
|
|
|
if (val == 0)
|
|
|
|
static_branch_inc(&rdpmc_never_available_key);
|
|
|
|
else if (x86_pmu.attr_rdpmc == 0)
|
|
|
|
static_branch_dec(&rdpmc_never_available_key);
|
|
|
|
|
2014-10-25 06:58:13 +08:00
|
|
|
if (val == 2)
|
2018-03-27 05:09:27 +08:00
|
|
|
static_branch_inc(&rdpmc_always_available_key);
|
2019-11-25 13:48:38 +08:00
|
|
|
else if (x86_pmu.attr_rdpmc == 2)
|
2018-03-27 05:09:27 +08:00
|
|
|
static_branch_dec(&rdpmc_always_available_key);
|
2019-11-25 13:48:38 +08:00
|
|
|
|
2020-04-21 17:20:30 +08:00
|
|
|
on_each_cpu(cr4_update_pce, NULL, 1);
|
2019-11-25 13:48:38 +08:00
|
|
|
x86_pmu.attr_rdpmc = val;
|
2014-10-25 06:58:13 +08:00
|
|
|
}
|
|
|
|
|
2011-11-21 06:30:47 +08:00
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
|
|
|
|
|
|
|
|
static struct attribute *x86_pmu_attrs[] = {
|
|
|
|
&dev_attr_rdpmc.attr,
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
2018-08-10 23:43:14 +08:00
|
|
|
static struct attribute_group x86_pmu_attr_group __ro_after_init = {
|
2011-11-21 06:30:47 +08:00
|
|
|
.attrs = x86_pmu_attrs,
|
|
|
|
};
|
|
|
|
|
2017-08-28 18:46:50 +08:00
|
|
|
static ssize_t max_precise_show(struct device *cdev,
|
|
|
|
struct device_attribute *attr,
|
|
|
|
char *buf)
|
|
|
|
{
|
|
|
|
return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
|
|
|
|
}
|
|
|
|
|
|
|
|
static DEVICE_ATTR_RO(max_precise);
|
|
|
|
|
|
|
|
static struct attribute *x86_pmu_caps_attrs[] = {
|
|
|
|
&dev_attr_max_precise.attr,
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2018-08-10 23:43:14 +08:00
|
|
|
static struct attribute_group x86_pmu_caps_group __ro_after_init = {
|
2017-08-28 18:46:50 +08:00
|
|
|
.name = "caps",
|
|
|
|
.attrs = x86_pmu_caps_attrs,
|
|
|
|
};
|
|
|
|
|
2011-11-21 06:30:47 +08:00
|
|
|
static const struct attribute_group *x86_pmu_attr_groups[] = {
|
|
|
|
&x86_pmu_attr_group,
|
2012-03-16 03:09:14 +08:00
|
|
|
&x86_pmu_format_group,
|
2012-10-10 20:53:11 +08:00
|
|
|
&x86_pmu_events_group,
|
2017-08-23 02:52:01 +08:00
|
|
|
&x86_pmu_caps_group,
|
2011-11-21 06:30:47 +08:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
|
perf: Rewrite core context handling
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).
Notably:
- HW breakpoint PMU
- ARM big.little PMU / Intel ADL PMU
- Intel Branch Monitoring PMU
- AMD IBS PMU
- S390 cpum_cf PMU
- PowerPC trace_imc PMU
*Current design:*
Currently we have a per task and per cpu perf_event_contexts:
task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
^ | ^ | ^
`---------------------------------' | `--> pmu ---'
v ^
perf_event ------'
Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task related perf_event_context's have a
pointer back to that task.
Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.
The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.
Each perf_event is then associated with its PMU and one
perf_event_context.
*Proposed design:*
The new design proposed by this patch reduces to a single task context and
a single CPU context but adds some intermediate data-structures:
task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
^ | ^ ^
`---------------------------' | |
| | perf_cpu_pmu_context <--.
| `----. ^ |
| | | |
| v v |
| ,--> perf_event_pmu_context |
| | |
| | |
v v |
perf_event ---> pmu ----------------'
With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:
{cpu, pmu, cgroup, group_index}
Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.
Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.
Each perf_event is associated with its pmu, perf_event_context and
perf_event_pmu_context.
Further optimizations to current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.
Much thanks to Ravi for picking this up and pushing it towards
completion.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
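To make the linkage above concrete, here is a deliberately simplified model of the relations (NOT the kernel's actual struct definitions; the fields are reduced to the pointers discussed above):

#include <linux/list.h>

struct pmu;

struct perf_event_context {			/* one per task, one per CPU */
	struct list_head	pmu_ctx_list;	/* list of perf_event_pmu_context */
};

struct perf_event_pmu_context {			/* per-pmu state within one context */
	struct pmu			*pmu;
	struct perf_event_context	*ctx;
};

struct perf_event {
	struct pmu			*pmu;
	struct perf_event_context	*ctx;
	struct perf_event_pmu_context	*pmu_ctx;
};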
2022-10-08 14:24:24 +08:00
|
|
|
static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
|
2012-02-10 06:21:00 +08:00
|
|
|
{
|
perf: Rewrite core context handling
2022-10-08 14:24:24 +08:00
|
|
|
static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
|
2012-02-10 06:21:00 +08:00
|
|
|
}
|
|
|
|
|
2022-10-08 14:24:24 +08:00
|
|
|
static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
|
|
|
|
struct perf_event_pmu_context *next_epc)
|
2019-10-23 15:11:54 +08:00
|
|
|
{
|
2022-10-08 14:24:24 +08:00
|
|
|
static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
|
2019-10-23 15:11:54 +08:00
|
|
|
}
|
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
void perf_check_microcode(void)
|
|
|
|
{
|
|
|
|
if (x86_pmu.check_microcode)
|
|
|
|
x86_pmu.check_microcode();
|
|
|
|
}
|
|
|
|
|
2019-02-04 20:35:32 +08:00
|
|
|
static int x86_pmu_check_period(struct perf_event *event, u64 value)
|
|
|
|
{
|
|
|
|
if (x86_pmu.check_period && x86_pmu.check_period(event, value))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (value && x86_pmu.limit_period) {
|
2022-05-11 03:28:25 +08:00
|
|
|
s64 left = value;
|
|
|
|
x86_pmu.limit_period(event, &left);
|
|
|
|
if (left > value)
|
2019-02-04 20:35:32 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
perf/x86/intel: Support PEBS output to PT
If PEBS declares the ability to output its data to the Intel PT stream, use the
aux_output attribute bit to enable PEBS data output to PT. This requires
a PT event to be present and scheduled in the same context. Unlike the
DS area, the kernel does not extract PEBS records from the PT stream to
generate corresponding records in the perf stream, because that would
require real time in-kernel PT decoding, which is not feasible. The PMI,
however, can still be used.
The output setting is per-CPU, so all PEBS events must either write
to PT or to the DS area; therefore, in case of conflict, the conflicting
event will fail to schedule, allowing the rotation logic to alternate
between the PEBS->PT and PEBS->DS events.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: kan.liang@linux.intel.com
Link: https://lkml.kernel.org/r/20190806084606.4021-3-alexander.shishkin@linux.intel.com
2019-08-06 16:46:01 +08:00
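To make the grouping requirement above concrete, here is a hedged user-space sketch that opens an intel_pt group leader and attaches a PEBS-capable event to it with attr.aux_output set; the intel_pt PMU type value and the chosen sampling event are placeholders, and real code would read the type from /sys/bus/event_source/devices/intel_pt/type.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int open_pebs_to_pt(void)
{
	struct perf_event_attr pt = {}, pebs = {};
	int pt_fd, pebs_fd;

	pt.size = sizeof(pt);
	pt.type = 8;			/* placeholder: read intel_pt's dynamic type from sysfs */

	pebs.size       = sizeof(pebs);
	pebs.type       = PERF_TYPE_HARDWARE;
	pebs.config     = PERF_COUNT_HW_CPU_CYCLES;
	pebs.precise_ip = 3;		/* request a PEBS-based sample */
	pebs.aux_output = 1;		/* route PEBS records into the PT stream */

	pt_fd = perf_event_open(&pt, 0, -1, -1, 0);
	if (pt_fd < 0)
		return -1;
	/* same group, hence same context, as the PT event - see the note above */
	pebs_fd = perf_event_open(&pebs, 0, -1, pt_fd, 0);
	return pebs_fd < 0 ? -1 : pebs_fd;
}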
|
|
|
static int x86_pmu_aux_output_match(struct perf_event *event)
|
|
|
|
{
|
|
|
|
if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (x86_pmu.aux_output_match)
|
|
|
|
return x86_pmu.aux_output_match(event);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-10-08 14:24:24 +08:00
|
|
|
static bool x86_pmu_filter(struct pmu *pmu, int cpu)
|
2021-04-12 22:30:59 +08:00
|
|
|
{
|
2022-10-08 14:24:24 +08:00
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
|
2021-04-12 22:30:59 +08:00
|
|
|
|
2022-10-08 14:24:24 +08:00
|
|
|
return ret;
|
2021-04-12 22:30:59 +08:00
|
|
|
}
|
|
|
|
|
2010-06-11 19:35:08 +08:00
|
|
|
static struct pmu pmu = {
|
2012-02-10 06:21:00 +08:00
|
|
|
.pmu_enable = x86_pmu_enable,
|
|
|
|
.pmu_disable = x86_pmu_disable,
|
perf: Rework the PMU methods
Replace pmu::{enable,disable,start,stop,unthrottle} with
pmu::{add,del,start,stop}, all of which take a flags argument.
The new interface extends the capability to stop a counter while
keeping it scheduled on the PMU. We replace the throttled state with
the generic stopped state.
This also allows us to efficiently stop/start counters over certain
code paths (like IRQ handlers).
It also allows scheduling a counter without it starting, allowing for
a generic frozen state (useful for rotating stopped counters).
The stopped state is implemented in two different ways, depending on
how the architecture implemented the throttled state:
 1) We disable the counter:
    a) the pmu has per-counter enable bits, we flip that
    b) we program a NOP event, preserving the counter state
 2) We store the counter state and ignore all read/overflow events
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com>
Cc: David Miller <davem@davemloft.net>
Cc: Michael Cree <mcree@orcon.net.nz>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-16 20:37:10 +08:00
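A minimal sketch of the resulting add/del/start/stop contract follows; the toy_* names and the hardware accessors are hypothetical stand-ins, while the callback signatures, the PERF_EF_* flags and the hw.state bits are the real pieces of the interface being described.

/* Hypothetical hardware accessors used only by this sketch. */
static void toy_hw_program(struct perf_event *event);
static void toy_hw_unprogram(struct perf_event *event);
static void toy_hw_read(struct perf_event *event);

static void toy_event_start(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_RELOAD)		/* re-arm the counter from its saved value */
		toy_hw_program(event);
	event->hw.state = 0;			/* counter is now running */
}

static void toy_event_stop(struct perf_event *event, int flags)
{
	toy_hw_unprogram(event);		/* stop counting but keep the counter scheduled */
	event->hw.state |= PERF_HES_STOPPED;
	if (flags & PERF_EF_UPDATE) {		/* fold the hardware value into the event count */
		toy_hw_read(event);
		event->hw.state |= PERF_HES_UPTODATE;
	}
}

static int toy_event_add(struct perf_event *event, int flags)
{
	event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
	if (flags & PERF_EF_START)		/* schedule and start in one call */
		toy_event_start(event, PERF_EF_RELOAD);
	return 0;
}

static void toy_event_del(struct perf_event *event, int flags)
{
	toy_event_stop(event, PERF_EF_UPDATE);	/* unschedule with a final count update */
}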
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
.attr_groups = x86_pmu_attr_groups,
|
2011-11-21 06:30:47 +08:00
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
.event_init = x86_pmu_event_init,
|
2010-06-16 20:37:10 +08:00
|
|
|
|
2014-10-25 06:58:12 +08:00
|
|
|
.event_mapped = x86_pmu_event_mapped,
|
|
|
|
.event_unmapped = x86_pmu_event_unmapped,
|
|
|
|
|
2012-02-10 06:21:00 +08:00
|
|
|
.add = x86_pmu_add,
|
|
|
|
.del = x86_pmu_del,
|
|
|
|
.start = x86_pmu_start,
|
|
|
|
.stop = x86_pmu_stop,
|
|
|
|
.read = x86_pmu_read,
|
2010-06-16 20:37:10 +08:00
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
.start_txn = x86_pmu_start_txn,
|
|
|
|
.cancel_txn = x86_pmu_cancel_txn,
|
|
|
|
.commit_txn = x86_pmu_commit_txn,
|
2011-11-21 03:44:06 +08:00
|
|
|
|
2012-06-08 20:50:50 +08:00
|
|
|
.event_idx = x86_pmu_event_idx,
|
2014-11-05 10:55:58 +08:00
|
|
|
.sched_task = x86_pmu_sched_task,
|
2019-10-23 15:11:54 +08:00
|
|
|
.swap_task_ctx = x86_pmu_swap_task_ctx,
|
2019-02-04 20:35:32 +08:00
|
|
|
.check_period = x86_pmu_check_period,
|
2019-08-06 16:46:01 +08:00
|
|
|
|
|
|
|
.aux_output_match = x86_pmu_aux_output_match,
|
2021-04-12 22:30:59 +08:00
|
|
|
|
2022-10-08 14:24:24 +08:00
|
|
|
.filter = x86_pmu_filter,
|
2010-06-11 19:35:08 +08:00
|
|
|
};
|
|
|
|
|
2014-10-25 06:58:11 +08:00
|
|
|
void arch_perf_update_userpage(struct perf_event *event,
|
|
|
|
struct perf_event_mmap_page *userpg, u64 now)
|
2011-11-21 18:43:53 +08:00
|
|
|
{
|
2017-05-02 19:22:07 +08:00
|
|
|
struct cyc2ns_data data;
|
2017-03-17 19:48:18 +08:00
|
|
|
u64 offset;
|
2013-11-29 22:40:29 +08:00
|
|
|
|
perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page'
Solve the problems around the broken definition of perf_event_mmap_page::
cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially
fixed by:
860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'")
The problem with the fix (merged in v3.12-rc1 and not yet released
officially), noticed by Vince Weaver is that the new behavior is
not detectable by new user-space, and that due to the reuse of the
field names it's easy to mis-compile a binary if old headers are used
on a new kernel or new headers are used on an old kernel.
To solve all that make this change explicit, detectable and self-contained,
by iterating the ABI the following way:
- Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not
confuse old user-space binaries. RDPMC will be marked as unavailable
to old binaries but that's within the ABI, this is a capability bit.
- Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new
libraries can reliably detect that bit 0 is deprecated and perma-zero
without having to check the kernel version.
 - Use bits 2, 3, 4 for the newly defined, correct functionality:
     cap_user_rdpmc     : 1, /* The RDPMC instruction can be used to read counts */
     cap_user_time      : 1, /* The time_* fields are used */
     cap_user_time_zero : 1, /* The time_zero field is used */
- Rename all the bitfield names in perf_event.h to be different from the
old names, to make sure it's not possible to mis-compile it
accidentally with old assumptions.
The 'size' field can then be used in the future to add new fields and it
will act as a natural ABI version indicator as well.
Also adjust tools/perf/ userspace for the new definitions, noticed by
Adrian Hunter.
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Also-Fixed-by: Adrian Hunter <adrian.hunter@intel.com>
Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-09-19 16:16:42 +08:00
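A hedged sketch of how fixed user-space applies the scheme above when it maps the event's perf_event_mmap_page; the helper name is made up, the capability fields are the ones this change defines.

#include <linux/perf_event.h>

static int userpage_rdpmc_usable(const struct perf_event_mmap_page *pc)
{
	if (!pc->cap_bit0_is_deprecated)
		return 0;		/* old kernel: bits 0/1 overlapped, don't trust them */
	return pc->cap_user_rdpmc;	/* RDPMC may be used to read this event's counter */
}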
|
|
|
userpg->cap_user_time = 0;
|
|
|
|
userpg->cap_user_time_zero = 0;
|
2014-10-25 06:58:12 +08:00
|
|
|
userpg->cap_user_rdpmc =
|
2021-12-09 04:11:20 +08:00
|
|
|
!!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
|
2012-03-23 00:26:36 +08:00
|
|
|
userpg->pmc_width = x86_pmu.cntval_bits;
|
|
|
|
|
2017-03-17 19:48:18 +08:00
|
|
|
if (!using_native_sched_clock() || !sched_clock_stable())
|
2011-11-21 18:43:53 +08:00
|
|
|
return;
|
|
|
|
|
2017-05-02 19:22:07 +08:00
|
|
|
cyc2ns_read_begin(&data);
|
2013-11-29 22:40:29 +08:00
|
|
|
|
2017-05-02 19:22:07 +08:00
|
|
|
offset = data.cyc2ns_offset + __sched_clock_offset;
|
2017-03-17 19:48:18 +08:00
|
|
|
|
2015-02-20 21:05:38 +08:00
|
|
|
/*
|
|
|
|
* Internal timekeeping for enabled/running/stopped times
|
|
|
|
* is always in the local_clock domain.
|
|
|
|
*/
|
2013-09-19 16:16:42 +08:00
|
|
|
userpg->cap_user_time = 1;
|
2017-05-02 19:22:07 +08:00
|
|
|
userpg->time_mult = data.cyc2ns_mul;
|
|
|
|
userpg->time_shift = data.cyc2ns_shift;
|
2017-03-17 19:48:18 +08:00
|
|
|
userpg->time_offset = offset - now;
|
2013-06-28 21:22:18 +08:00
|
|
|
|
2015-02-20 21:05:38 +08:00
|
|
|
/*
|
|
|
|
* cap_user_time_zero doesn't make sense when we're using a different
|
|
|
|
* time base for the records.
|
|
|
|
*/
|
2016-04-14 19:59:49 +08:00
|
|
|
if (!event->attr.use_clockid) {
|
2015-02-20 21:05:38 +08:00
|
|
|
userpg->cap_user_time_zero = 1;
|
2017-03-17 19:48:18 +08:00
|
|
|
userpg->time_zero = offset;
|
2015-02-20 21:05:38 +08:00
|
|
|
}
|
2013-11-29 22:40:29 +08:00
|
|
|
|
2017-05-02 19:22:07 +08:00
|
|
|
cyc2ns_read_end();
|
2011-11-21 18:43:53 +08:00
|
|
|
}
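For the consumer side, here is a hedged user-space sketch of turning a raw TSC value into perf time with the fields arch_perf_update_userpage() publishes above; it assumes cap_user_time and cap_user_time_zero have been checked and omits the documented pc->lock seqcount retry loop.

#include <linux/perf_event.h>

static __u64 user_tsc_to_perf_time(__u64 tsc, const struct perf_event_mmap_page *pc)
{
	__u64 quot = tsc >> pc->time_shift;
	__u64 rem  = tsc & (((__u64)1 << pc->time_shift) - 1);

	return pc->time_zero + quot * pc->time_mult +
	       ((rem * pc->time_mult) >> pc->time_shift);
}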
|
|
|
|
|
2019-04-23 00:26:52 +08:00
|
|
|
/*
|
|
|
|
* Determine whether the regs were taken from an irq/exception handler rather
|
|
|
|
* than from perf_arch_fetch_caller_regs().
|
|
|
|
*/
|
|
|
|
static bool perf_hw_regs(struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
return regs->flags & X86_EFLAGS_FIXED;
|
|
|
|
}
|
|
|
|
|
2010-07-01 05:03:51 +08:00
|
|
|
void
|
2016-04-28 23:30:53 +08:00
|
|
|
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
|
2009-03-31 01:07:15 +08:00
|
|
|
{
|
2016-09-17 03:18:13 +08:00
|
|
|
struct unwind_state state;
|
|
|
|
unsigned long addr;
|
|
|
|
|
2021-11-11 10:07:28 +08:00
|
|
|
if (perf_guest_state()) {
|
2010-07-01 22:20:36 +08:00
|
|
|
/* TODO: We don't support guest os callchain now */
|
2010-08-20 20:30:41 +08:00
|
|
|
return;
|
2010-07-01 22:20:36 +08:00
|
|
|
}
|
|
|
|
|
2019-06-27 08:33:52 +08:00
|
|
|
if (perf_callchain_store(entry, regs->ip))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (perf_hw_regs(regs))
|
2019-04-23 00:26:52 +08:00
|
|
|
unwind_start(&state, current, regs, NULL);
|
2019-06-27 08:33:52 +08:00
|
|
|
else
|
2019-04-23 00:26:52 +08:00
|
|
|
unwind_start(&state, current, NULL, (void *)regs->sp);
|
2009-03-31 01:07:15 +08:00
|
|
|
|
2019-04-23 00:26:52 +08:00
|
|
|
for (; !unwind_done(&state); unwind_next_frame(&state)) {
|
2016-09-17 03:18:13 +08:00
|
|
|
addr = unwind_get_return_address(&state);
|
|
|
|
if (!addr || perf_callchain_store(entry, addr))
|
|
|
|
return;
|
|
|
|
}
|
2009-03-31 01:07:15 +08:00
|
|
|
}
|
|
|
|
|
2012-04-21 06:41:35 +08:00
|
|
|
static inline int
|
|
|
|
valid_user_frame(const void __user *fp, unsigned long size)
|
|
|
|
{
|
2022-02-15 16:15:57 +08:00
|
|
|
return __access_ok(fp, size);
|
2012-04-21 06:41:35 +08:00
|
|
|
}
|
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
static unsigned long get_segment_base(unsigned int segment)
|
|
|
|
{
|
|
|
|
struct desc_struct *desc;
|
2016-12-10 07:13:51 +08:00
|
|
|
unsigned int idx = segment >> 3;
|
2012-07-10 15:42:15 +08:00
|
|
|
|
|
|
|
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
|
2015-07-31 05:31:34 +08:00
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
2015-07-31 05:31:32 +08:00
|
|
|
struct ldt_struct *ldt;
|
|
|
|
|
|
|
|
/* IRQs are off, so this synchronizes with smp_store_release */
|
2017-10-24 18:22:48 +08:00
|
|
|
ldt = READ_ONCE(current->active_mm->context.ldt);
|
2017-08-18 18:30:30 +08:00
|
|
|
if (!ldt || idx >= ldt->nr_entries)
|
2012-07-10 15:42:15 +08:00
|
|
|
return 0;
|
|
|
|
|
2015-07-31 05:31:32 +08:00
|
|
|
desc = &ldt->entries[idx];
|
2015-07-31 05:31:34 +08:00
|
|
|
#else
|
|
|
|
return 0;
|
|
|
|
#endif
|
2012-07-10 15:42:15 +08:00
|
|
|
} else {
|
2017-08-18 18:30:30 +08:00
|
|
|
if (idx >= GDT_ENTRIES)
|
2012-07-10 15:42:15 +08:00
|
|
|
return 0;
|
|
|
|
|
2015-07-31 05:31:32 +08:00
|
|
|
desc = raw_cpu_ptr(gdt_page.gdt) + idx;
|
2012-07-10 15:42:15 +08:00
|
|
|
}
|
|
|
|
|
2015-07-31 05:31:32 +08:00
|
|
|
return get_desc_base(desc);
|
2012-07-10 15:42:15 +08:00
|
|
|
}
|
|
|
|
|
2015-06-22 19:55:17 +08:00
|
|
|
#ifdef CONFIG_IA32_EMULATION
|
2012-02-20 02:06:34 +08:00
|
|
|
|
2018-03-14 12:03:25 +08:00
|
|
|
#include <linux/compat.h>
|
2012-02-20 02:06:34 +08:00
|
|
|
|
2010-03-17 18:07:16 +08:00
|
|
|
static inline int
|
2016-04-28 23:30:53 +08:00
|
|
|
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
|
2009-06-15 19:07:24 +08:00
|
|
|
{
|
2010-03-17 18:07:16 +08:00
|
|
|
/* 32-bit process in 64-bit kernel. */
|
2012-07-10 15:42:15 +08:00
|
|
|
unsigned long ss_base, cs_base;
|
2010-03-17 18:07:16 +08:00
|
|
|
struct stack_frame_ia32 frame;
|
2020-02-16 00:28:09 +08:00
|
|
|
const struct stack_frame_ia32 __user *fp;
|
2009-06-15 19:07:24 +08:00
|
|
|
|
2020-10-04 11:25:27 +08:00
|
|
|
if (user_64bit_mode(regs))
|
2010-03-17 18:07:16 +08:00
|
|
|
return 0;
|
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
cs_base = get_segment_base(regs->cs);
|
|
|
|
ss_base = get_segment_base(regs->ss);
|
|
|
|
|
|
|
|
fp = compat_ptr(ss_base + regs->bp);
|
2015-10-23 06:07:21 +08:00
|
|
|
pagefault_disable();
|
perf core: Add a 'nr' field to perf_event_callchain_context
We will use it to count how many addresses are in the entry->ip[] array,
excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really
return the number of entries specified by the user via the relevant
sysctl, kernel.perf_event_max_contexts, or via the per event
perf_event_attr.sample_max_stack knob.
This way we keep the perf_sample->ip_callchain->nr meaning, that is the
number of entries, be it real addresses or PERF_CONTEXT_ entries, while
honouring the max_stack knobs, i.e. the end result will be max_stack
entries if we have at least that many entries in a given stack trace.
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2016-05-11 05:08:32 +08:00
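As a small usage note, here is a hedged sketch of the per-event knob mentioned above; the values are arbitrary, and the resulting limit feeds into the entry->max_stack bound used by the loops below.

#include <linux/perf_event.h>

struct perf_event_attr callchain_attr = {
	.type             = PERF_TYPE_HARDWARE,
	.config           = PERF_COUNT_HW_CPU_CYCLES,
	.size             = sizeof(struct perf_event_attr),
	.sample_type      = PERF_SAMPLE_CALLCHAIN,
	.sample_max_stack = 32,		/* bound the user callchain depth for this event */
};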
|
|
|
while (entry->nr < entry->max_stack) {
|
2016-11-22 17:57:42 +08:00
|
|
|
if (!valid_user_frame(fp, sizeof(frame)))
|
2015-10-23 06:07:21 +08:00
|
|
|
break;
|
|
|
|
|
2020-02-16 00:28:09 +08:00
|
|
|
if (__get_user(frame.next_frame, &fp->next_frame))
|
2015-10-23 06:07:21 +08:00
|
|
|
break;
|
2020-02-16 00:28:09 +08:00
|
|
|
if (__get_user(frame.return_address, &fp->return_address))
|
2010-03-17 18:07:16 +08:00
|
|
|
break;
|
2009-06-15 19:07:24 +08:00
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
perf_callchain_store(entry, cs_base + frame.return_address);
|
|
|
|
fp = compat_ptr(ss_base + frame.next_frame);
|
2010-03-17 18:07:16 +08:00
|
|
|
}
|
2015-10-23 06:07:21 +08:00
|
|
|
pagefault_enable();
|
2010-03-17 18:07:16 +08:00
|
|
|
return 1;
|
2009-03-31 01:07:15 +08:00
|
|
|
}
|
2010-03-17 18:07:16 +08:00
|
|
|
#else
|
|
|
|
static inline int
|
2016-04-28 23:30:53 +08:00
|
|
|
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
|
2010-03-17 18:07:16 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2009-03-31 01:07:15 +08:00
|
|
|
|
2010-07-01 05:03:51 +08:00
|
|
|
void
|
2016-04-28 23:30:53 +08:00
|
|
|
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
|
2009-03-31 01:07:15 +08:00
|
|
|
{
|
|
|
|
struct stack_frame frame;
|
2020-02-16 00:28:09 +08:00
|
|
|
const struct stack_frame __user *fp;
|
2009-03-31 01:07:15 +08:00
|
|
|
|
2021-11-11 10:07:28 +08:00
|
|
|
if (perf_guest_state()) {
|
2010-07-01 22:20:36 +08:00
|
|
|
/* TODO: We don't support guest os callchain now */
|
2010-08-20 20:30:41 +08:00
|
|
|
return;
|
2010-07-01 22:20:36 +08:00
|
|
|
}
|
2009-05-29 17:25:09 +08:00
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
/*
|
|
|
|
* We don't know what to do with VM86 stacks.. ignore them for now.
|
|
|
|
*/
|
|
|
|
if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
|
|
|
|
return;
|
|
|
|
|
2020-02-16 00:28:09 +08:00
|
|
|
fp = (void __user *)regs->bp;
|
2009-03-31 01:07:15 +08:00
|
|
|
|
2010-06-30 01:34:05 +08:00
|
|
|
perf_callchain_store(entry, regs->ip);
|
2009-03-31 01:07:15 +08:00
|
|
|
|
2018-08-29 23:47:18 +08:00
|
|
|
if (!nmi_uaccess_okay())
|
2011-08-30 16:32:36 +08:00
|
|
|
return;
|
|
|
|
|
2010-03-17 18:07:16 +08:00
|
|
|
if (perf_callchain_user32(regs, entry))
|
|
|
|
return;
|
|
|
|
|
2015-10-23 06:07:21 +08:00
|
|
|
pagefault_disable();
|
2016-05-11 05:08:32 +08:00
|
|
|
while (entry->nr < entry->max_stack) {
|
2016-11-22 17:57:42 +08:00
|
|
|
if (!valid_user_frame(fp, sizeof(frame)))
|
2015-10-23 06:07:21 +08:00
|
|
|
break;
|
|
|
|
|
2020-02-16 00:28:09 +08:00
|
|
|
if (__get_user(frame.next_frame, &fp->next_frame))
|
2015-10-23 06:07:21 +08:00
|
|
|
break;
|
2020-02-16 00:28:09 +08:00
|
|
|
if (__get_user(frame.return_address, &fp->return_address))
|
2009-03-31 01:07:15 +08:00
|
|
|
break;
|
|
|
|
|
2010-06-30 01:34:05 +08:00
|
|
|
perf_callchain_store(entry, frame.return_address);
|
2015-10-23 06:07:21 +08:00
|
|
|
fp = (void __user *)frame.next_frame;
|
2009-03-31 01:07:15 +08:00
|
|
|
}
|
2015-10-23 06:07:21 +08:00
|
|
|
pagefault_enable();
|
2009-03-31 01:07:15 +08:00
|
|
|
}
|
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
/*
|
|
|
|
* Deal with code segment offsets for the various execution modes:
|
|
|
|
*
|
|
|
|
* VM86 - the good olde 16 bit days, where the linear address is
|
|
|
|
* 20 bits and we use regs->ip + 0x10 * regs->cs.
|
|
|
|
*
|
|
|
|
* IA32 - Where we need to look at GDT/LDT segment descriptor tables
|
|
|
|
* to figure out what the 32bit base address is.
|
|
|
|
*
|
|
|
|
* X32 - has TIF_X32 set, but is running in x86_64
|
|
|
|
*
|
|
|
|
* X86_64 - CS,DS,SS,ES are all zero based.
|
|
|
|
*/
|
|
|
|
static unsigned long code_segment_base(struct pt_regs *regs)
|
2010-04-19 13:32:41 +08:00
|
|
|
{
|
2015-03-19 09:33:30 +08:00
|
|
|
/*
|
|
|
|
* For IA32 we look at the GDT/LDT segment base to convert the
|
|
|
|
* effective IP to a linear address.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifdef CONFIG_X86_32
|
2012-07-10 15:42:15 +08:00
|
|
|
/*
|
|
|
|
* If we are in VM86 mode, add the segment offset to convert to a
|
|
|
|
* linear address.
|
|
|
|
*/
|
|
|
|
if (regs->flags & X86_VM_MASK)
|
|
|
|
return 0x10 * regs->cs;
|
|
|
|
|
2015-03-29 17:02:34 +08:00
|
|
|
if (user_mode(regs) && regs->cs != __USER_CS)
|
2012-07-10 15:42:15 +08:00
|
|
|
return get_segment_base(regs->cs);
|
|
|
|
#else
|
2015-03-19 09:33:28 +08:00
|
|
|
if (user_mode(regs) && !user_64bit_mode(regs) &&
|
|
|
|
regs->cs != __USER32_CS)
|
|
|
|
return get_segment_base(regs->cs);
|
2012-07-10 15:42:15 +08:00
|
|
|
#endif
|
|
|
|
return 0;
|
|
|
|
}
|
2010-04-20 10:13:58 +08:00
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
unsigned long perf_instruction_pointer(struct pt_regs *regs)
|
|
|
|
{
|
2021-11-11 10:07:28 +08:00
|
|
|
if (perf_guest_state())
|
|
|
|
return perf_guest_get_ip();
|
2010-04-20 10:13:58 +08:00
|
|
|
|
2012-07-10 15:42:15 +08:00
|
|
|
return regs->ip + code_segment_base(regs);
|
2010-04-19 13:32:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long perf_misc_flags(struct pt_regs *regs)
|
|
|
|
{
|
2021-11-11 10:07:28 +08:00
|
|
|
unsigned int guest_state = perf_guest_state();
|
2010-04-19 13:32:41 +08:00
|
|
|
int misc = 0;
|
2010-04-20 10:13:58 +08:00
|
|
|
|
2021-11-11 10:07:27 +08:00
|
|
|
if (guest_state) {
|
|
|
|
if (guest_state & PERF_GUEST_USER)
|
2010-04-20 10:13:58 +08:00
|
|
|
misc |= PERF_RECORD_MISC_GUEST_USER;
|
|
|
|
else
|
|
|
|
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
|
|
|
|
} else {
|
2012-07-10 15:42:15 +08:00
|
|
|
if (user_mode(regs))
|
2010-04-20 10:13:58 +08:00
|
|
|
misc |= PERF_RECORD_MISC_USER;
|
|
|
|
else
|
|
|
|
misc |= PERF_RECORD_MISC_KERNEL;
|
|
|
|
}
|
|
|
|
|
2010-04-19 13:32:41 +08:00
|
|
|
if (regs->flags & PERF_EFLAGS_EXACT)
|
2010-04-09 05:03:20 +08:00
|
|
|
misc |= PERF_RECORD_MISC_EXACT_IP;
|
2010-04-19 13:32:41 +08:00
|
|
|
|
|
|
|
return misc;
|
|
|
|
}
|
2011-11-10 20:57:27 +08:00
|
|
|
|
|
|
|
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
|
|
|
|
{
|
2023-02-09 04:42:30 +08:00
|
|
|
/* This API doesn't currently support enumerating hybrid PMUs. */
|
|
|
|
if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
|
|
|
|
!x86_pmu_initialized()) {
|
2022-06-01 17:45:17 +08:00
|
|
|
memset(cap, 0, sizeof(*cap));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-04-12 22:30:46 +08:00
|
|
|
/*
|
2023-02-09 04:42:30 +08:00
|
|
|
* Note, hybrid CPU models get tracked as having hybrid PMUs even when
|
|
|
|
* all E-cores are disabled via BIOS. When E-cores are disabled, the
|
|
|
|
* base PMU holds the correct number of counters for P-cores.
|
2021-04-12 22:30:46 +08:00
|
|
|
*/
|
2023-02-09 04:42:30 +08:00
|
|
|
cap->version = x86_pmu.version;
|
2011-11-10 20:57:27 +08:00
|
|
|
cap->num_counters_gp = x86_pmu.num_counters;
|
|
|
|
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
|
|
|
|
cap->bit_width_gp = x86_pmu.cntval_bits;
|
|
|
|
cap->bit_width_fixed = x86_pmu.cntval_bits;
|
|
|
|
cap->events_mask = (unsigned int)x86_pmu.events_maskl;
|
|
|
|
cap->events_mask_len = x86_pmu.events_mask_len;
|
2022-04-11 18:19:30 +08:00
|
|
|
cap->pebs_ept = x86_pmu.pebs_ept;
|
2011-11-10 20:57:27 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
|
2022-05-18 21:25:10 +08:00
|
|
|
|
|
|
|
u64 perf_get_hw_event_config(int hw_event)
|
|
|
|
{
|
|
|
|
int max = x86_pmu.max_events;
|
|
|
|
|
|
|
|
if (hw_event < max)
|
|
|
|
return x86_pmu.event_map(array_index_nospec(hw_event, max));
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(perf_get_hw_event_config);
|