perf/x86/intel/lbr: Optimize context switches for the LBR call stack

Context switches with perf LBR call stack context are fairly expensive
because they do a lot of MSR writes. Currently we unconditionally do the
expensive operation when LBR call stack is enabled. It's not necessary
for some common cases, e.g., task -> other kernel thread -> same task.
In such cases the LBR registers are not changed, hence they don't need
to be rewritten/restored.

Introduce per-CPU variables to track the last LBR call stack context.
If the same context is scheduled in, the rewrite/restore is not
required, with the following two exceptions:

 - The LBR registers may be modified by a normal LBR event, i.e., adding
   a new LBR event or scheduling an existing LBR event. In both cases,
   the LBR registers are reset first. The last LBR call stack information
   is cleared in intel_pmu_lbr_reset(). Restoring the LBR registers is
   required.

 - The LBR registers are initialized to zero in C6.
   If the LBR register that TOS points to is cleared, C6 must have been
   entered while the task was swapped out. Restoring the LBR registers
   is required as well.

These exceptions are not common.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: acme@kernel.org
Cc: eranian@google.com
Link: https://lore.kernel.org/lkml/1528213126-4312-2-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Kan Liang 2018-06-05 08:38:46 -07:00 committed by Ingo Molnar
parent 0592e57b24
commit 8b077e4a69
2 changed files with 27 additions and 1 deletions

View File

@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void)
void intel_pmu_lbr_reset(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_32();
else
intel_pmu_lbr_reset_64();
cpuc->last_task_ctx = NULL;
cpuc->last_log_id = 0;
}
/*
@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx)
static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
unsigned lbr_idx, mask;
u64 tos;
@ -344,8 +350,20 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
return;
}
mask = x86_pmu.lbr_nr - 1;
tos = task_ctx->tos;
/*
* Do not restore the LBR registers if:
* - no one else touched them, and
* - the CPU did not enter C6 (which would have zeroed them)
*/
if ((task_ctx == cpuc->last_task_ctx) &&
(task_ctx->log_id == cpuc->last_log_id) &&
rdlbr_from(tos)) {
task_ctx->lbr_stack_state = LBR_NONE;
return;
}
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
@ -369,6 +387,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned lbr_idx, mask;
u64 tos, from;
int i;
@ -393,6 +412,9 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
task_ctx->valid_lbrs = i;
task_ctx->tos = tos;
task_ctx->lbr_stack_state = LBR_VALID;
cpuc->last_task_ctx = task_ctx;
cpuc->last_log_id = ++task_ctx->log_id;
}
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)

View File

@ -163,6 +163,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
struct x86_perf_task_context;
#define MAX_LBR_ENTRIES 32
enum {
@ -214,6 +215,8 @@ struct cpu_hw_events {
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
u64 br_sel;
struct x86_perf_task_context *last_task_ctx;
int last_log_id;
/*
* Intel host/guest exclude bits
@ -651,6 +654,7 @@ struct x86_perf_task_context {
int valid_lbrs;
int lbr_callstack_users;
int lbr_stack_state;
int log_id;
};
#define x86_add_quirk(func_) \