From 44ee63587dce85593c22497140db16f4e5027860 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 Feb 2010 10:50:50 +0900 Subject: [PATCH 01/18] percpu: Add __percpu sparse annotations to hw_breakpoint Add __percpu sparse annotations to hw_breakpoint. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. In kernel/hw_breakpoint.c, per_cpu(nr_task_bp_pinned, cpu)'s will trigger spurious noderef related warnings from sparse. Changing it to &per_cpu(nr_task_bp_pinned[0], cpu) will work around the problem but deemed to ugly by the maintainer. Leave it alone until better solution can be found. Signed-off-by: Tejun Heo Cc: Stephen Rothwell Cc: K.Prasad LKML-Reference: <4B7B4B7A.9050902@kernel.org> Signed-off-by: Frederic Weisbecker --- include/linux/hw_breakpoint.h | 8 ++++---- kernel/hw_breakpoint.c | 10 +++++----- samples/hw_breakpoint/data_breakpoint.c | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 5977b724f7c6..c70d27af03f9 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -66,14 +66,14 @@ register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, int cpu); -extern struct perf_event ** +extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); -extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); +extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -100,7 +100,7 @@ static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, int cpu) { return NULL; } -static inline struct perf_event ** +static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { return NULL; } static inline int @@ -109,7 +109,7 @@ static inline int __register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void -unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { } +unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..6542eacb3fa5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index c69cbe9b2426..bd0f337afcab 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -34,7 +34,7 @@ #include #include -struct perf_event **sample_hbp; +struct perf_event * __percpu *sample_hbp; static char ksym_name[KSYM_NAME_LEN] = "pid_max"; module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); @@ -61,8 +61,8 @@ static int __init hw_break_module_init(void) attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler); - if (IS_ERR(sample_hbp)) { - ret = PTR_ERR(sample_hbp); + if (IS_ERR((void __force *)sample_hbp)) { + ret = PTR_ERR((void __force *)sample_hbp); goto fail; } From 84c6f88fc8265d7a712d7d6ed8fc1a878dfc84d1 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Thu, 4 Feb 2010 16:08:15 +0900 Subject: [PATCH 02/18] perf lock: Fix and add misc documentally things I've forgot to add 'perf lock' line to command-list.txt, so users of perf could not find perf lock when they type 'perf'. Fixing command-list.txt requires document (tools/perf/Documentation/perf-lock.txt). But perf lock is too much "under construction" to write a stable document, so this is something like pseudo document for now. And I wrote description of perf lock at help section of CONFIG_LOCK_STAT, this will navigate users of lock trace events. Signed-off-by: Hitoshi Mitake Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: <1265267295-8388-1-git-send-email-mitake@dcl.info.waseda.ac.jp> Signed-off-by: Frederic Weisbecker --- lib/Kconfig.debug | 6 ++++++ tools/perf/Documentation/perf-lock.txt | 29 ++++++++++++++++++++++++++ tools/perf/command-list.txt | 1 + 3 files changed, 36 insertions(+) create mode 100644 tools/perf/Documentation/perf-lock.txt diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 25c3ed594c54..65f964e7fe78 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -520,6 +520,12 @@ config LOCK_STAT For more details, see Documentation/lockstat.txt + You can analyze lock events with "perf lock", subcommand of perf. + If you want to use "perf lock", you need to turn on CONFIG_EVENT_TRACING. + + CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. + (CONFIG_LOCKDEP defines "acquire" and "release" events.) + config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt new file mode 100644 index 000000000000..b317102138c8 --- /dev/null +++ b/tools/perf/Documentation/perf-lock.txt @@ -0,0 +1,29 @@ +perf-lock(1) +============ + +NAME +---- +perf-lock - Analyze lock events + +SYNOPSIS +-------- +[verse] +'perf lock' {record|report|trace} + +DESCRIPTION +----------- +You can analyze various lock behaviours +and statistics with this 'perf lock' command. + + 'perf lock record ' records lock events + between start and end . And this command + produces the file "perf.data" which contains tracing + results of lock events. + + 'perf lock trace' shows raw lock events. + + 'perf lock report' reports statistical data. + +SEE ALSO +-------- +linkperf:perf[1] diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt index 9afcff2e3ae5..db6ee94d4a8e 100644 --- a/tools/perf/command-list.txt +++ b/tools/perf/command-list.txt @@ -18,3 +18,4 @@ perf-top mainporcelain common perf-trace mainporcelain common perf-probe mainporcelain common perf-kmem mainporcelain common +perf-lock mainporcelain common From b67577dfb45580c498bfdb1bc76c00c3b2ad6310 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Feb 2010 09:09:33 +0100 Subject: [PATCH 03/18] perf lock: Drop the buffers multiplexing dependency We need to deal with time ordered events to build a correct state machine of lock events. This is why we multiplex the lock events buffers. But the ordering is done from the kernel, on the tracing fast path, leading to high contention between cpus. Without multiplexing, the events appears in a weak order. If we have four events, each split per cpu, perf record will read the events buffers in the following order: [ CPU0 ev0, CPU0 ev1, CPU0 ev3, CPU0 ev4, CPU1 ev0, CPU1 ev0....] To handle a post processing reordering, we could just read and sort the whole in memory, but it just doesn't scale with high amounts of events: lock events can fill huge amounts in few times. Basically we need to sort in memory and find a "grace period" point when we know that a given slice of previously sorted events can be committed for post-processing, so that we can unload the memory usage step by step and keep a scalable sorting list. There is no strong rules about how to define such "grace period". What does this patch is: We define a FLUSH_PERIOD value that defines a grace period in seconds. We want to have a slice of events covering 2 * FLUSH_PERIOD in our sorted list. If FLUSH_PERIOD is big enough, it ensures every events that occured in the first half of the timeslice have all been buffered and there are none remaining and there won't be further to put inside this first timeslice. Then once we reach the 2 * FLUSH_PERIOD timeslice, we flush the first half to be gentle with the memory (the second half can still get new events in the middle, so wait another period to flush it) FLUSH_PERIOD is defined to 5 seconds. Say the first event started on time t0. We can safely assume that at the time we are processing events of t0 + 10 seconds, ther won't be anymore events to read from perf.data that occured between t0 and t0 + 5 seconds. Hence we can safely flush the first half. To point out funky bugs, we have a guardian that checks a new event timestamp is not below the last event's timestamp flushed and that displays a warning in this case. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Li Zefan Cc: Lai Jiangshan Cc: Masami Hiramatsu Cc: Jens Axboe --- tools/perf/builtin-lock.c | 148 +++++++++++++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fb9ab2ad3f92..e12c844df1e2 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -460,6 +460,150 @@ process_raw_event(void *data, int cpu, process_lock_release_event(data, event, cpu, timestamp, thread); } +struct raw_event_queue { + u64 timestamp; + int cpu; + void *data; + struct thread *thread; + struct list_head list; +}; + +static LIST_HEAD(raw_event_head); + +#define FLUSH_PERIOD (5 * NSEC_PER_SEC) + +static u64 flush_limit = ULLONG_MAX; +static u64 last_flush = 0; +struct raw_event_queue *last_inserted; + +static void flush_raw_event_queue(u64 limit) +{ + struct raw_event_queue *tmp, *iter; + + list_for_each_entry_safe(iter, tmp, &raw_event_head, list) { + if (iter->timestamp > limit) + return; + + if (iter == last_inserted) + last_inserted = NULL; + + process_raw_event(iter->data, iter->cpu, iter->timestamp, + iter->thread); + + last_flush = iter->timestamp; + list_del(&iter->list); + free(iter->data); + free(iter); + } +} + +static void __queue_raw_event_end(struct raw_event_queue *new) +{ + struct raw_event_queue *iter; + + list_for_each_entry_reverse(iter, &raw_event_head, list) { + if (iter->timestamp < new->timestamp) { + list_add(&new->list, &iter->list); + return; + } + } + + list_add(&new->list, &raw_event_head); +} + +static void __queue_raw_event_before(struct raw_event_queue *new, + struct raw_event_queue *iter) +{ + list_for_each_entry_continue_reverse(iter, &raw_event_head, list) { + if (iter->timestamp < new->timestamp) { + list_add(&new->list, &iter->list); + return; + } + } + + list_add(&new->list, &raw_event_head); +} + +static void __queue_raw_event_after(struct raw_event_queue *new, + struct raw_event_queue *iter) +{ + list_for_each_entry_continue(iter, &raw_event_head, list) { + if (iter->timestamp > new->timestamp) { + list_add_tail(&new->list, &iter->list); + return; + } + } + list_add_tail(&new->list, &raw_event_head); +} + +/* The queue is ordered by time */ +static void __queue_raw_event(struct raw_event_queue *new) +{ + if (!last_inserted) { + __queue_raw_event_end(new); + return; + } + + /* + * Most of the time the current event has a timestamp + * very close to the last event inserted, unless we just switched + * to another event buffer. Having a sorting based on a list and + * on the last inserted event that is close to the current one is + * probably more efficient than an rbtree based sorting. + */ + if (last_inserted->timestamp >= new->timestamp) + __queue_raw_event_before(new, last_inserted); + else + __queue_raw_event_after(new, last_inserted); +} + +static void queue_raw_event(void *data, int raw_size, int cpu, + u64 timestamp, struct thread *thread) +{ + struct raw_event_queue *new; + + if (flush_limit == ULLONG_MAX) + flush_limit = timestamp + FLUSH_PERIOD; + + if (timestamp < last_flush) { + printf("Warning: Timestamp below last timeslice flush\n"); + return; + } + + new = malloc(sizeof(*new)); + if (!new) + die("Not enough memory\n"); + + new->timestamp = timestamp; + new->cpu = cpu; + new->thread = thread; + + new->data = malloc(raw_size); + if (!new->data) + die("Not enough memory\n"); + + memcpy(new->data, data, raw_size); + + __queue_raw_event(new); + last_inserted = new; + + /* + * We want to have a slice of events covering 2 * FLUSH_PERIOD + * If FLUSH_PERIOD is big enough, it ensures every events that occured + * in the first half of the timeslice have all been buffered and there + * are none remaining (we need that because of the weakly ordered + * event recording we have). Then once we reach the 2 * FLUSH_PERIOD + * timeslice, we flush the first half to be gentle with the memory + * (the second half can still get new events in the middle, so wait + * another period to flush it) + */ + if (new->timestamp > flush_limit && + new->timestamp - flush_limit > FLUSH_PERIOD) { + flush_limit += FLUSH_PERIOD; + flush_raw_event_queue(flush_limit); + } +} + static int process_sample_event(event_t *event, struct perf_session *session) { struct thread *thread; @@ -480,7 +624,7 @@ static int process_sample_event(event_t *event, struct perf_session *session) if (profile_cpu != -1 && profile_cpu != (int) data.cpu) return 0; - process_raw_event(data.raw_data, data.cpu, data.time, thread); + queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread); return 0; } @@ -576,6 +720,7 @@ static void __cmd_report(void) setup_pager(); select_key(); read_events(); + flush_raw_event_queue(ULLONG_MAX); sort_result(); print_result(); } @@ -608,7 +753,6 @@ static const char *record_args[] = { "record", "-a", "-R", - "-M", "-f", "-m", "1024", "-c", "1", From dd8b1cf681eab40bc5afb67bdd06b2ca341f5669 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Feb 2010 17:10:39 +0100 Subject: [PATCH 04/18] perf: Remove pointless breakpoint union Remove pointless union in the breakpoint field of hw_perf_event. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras --- include/linux/perf_event.h | 5 ++--- lib/Kconfig.debug | 8 +++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7b18b4fd5df7..04f06b4be297 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -487,9 +487,8 @@ struct hw_perf_event { struct hrtimer hrtimer; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT - union { /* breakpoint */ - struct arch_hw_breakpoint info; - }; + /* breakpoint */ + struct arch_hw_breakpoint info; #endif }; atomic64_t prev_count; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 65f964e7fe78..4dc24cc13f5c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -520,11 +520,13 @@ config LOCK_STAT For more details, see Documentation/lockstat.txt - You can analyze lock events with "perf lock", subcommand of perf. - If you want to use "perf lock", you need to turn on CONFIG_EVENT_TRACING. + This also enables lock events required by "perf lock", + subcommand of perf. + If you want to use "perf lock", you also need to turn on + CONFIG_EVENT_TRACING. CONFIG_LOCK_STAT defines "contended" and "acquired" lock events. - (CONFIG_LOCKDEP defines "acquire" and "release" events.) + (CONFIG_LOCKDEP defines "acquire" and "release" events.) config DEBUG_LOCKDEP bool "Lock dependency engine debugging" From 3d083407a16698de86b42aee0da2ffb280b5cb7e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Feb 2010 17:24:15 +0100 Subject: [PATCH 05/18] x86/hw-breakpoints: Remove the name field Remove the name field from the arch_hw_breakpoint. We never deal with target symbols in the arch level, neither do we need to ever store it. It's a legacy for the previous version of the x86 breakpoint backend. Let's remove it. Signed-off-by: Frederic Weisbecker Cc: K.Prasad Cc: Linus Torvalds --- arch/x86/include/asm/hw_breakpoint.h | 1 - arch/x86/kernel/hw_breakpoint.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 0675a7c4c20e..2a1bd8f4f23a 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@ * (display/resolving) */ struct arch_hw_breakpoint { - char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; u8 len; u8 type; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index dca2802c666f..41e08dff0161 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -343,13 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. From 1e259e0a9982078896f3404240096cbea01daca4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 20:51:15 +0100 Subject: [PATCH 06/18] hw-breakpoints: Remove stub unthrottle callback We support event unthrottling in breakpoint events. It means that if we have more than sysctl_perf_event_sample_rate/HZ, perf will throttle, ignoring subsequent events until the next tick. So if ptrace exceeds this max rate, it will omit events, which breaks the ptrace determinism that is supposed to report every triggered breakpoints. This is likely to happen if we set sysctl_perf_event_sample_rate to 1. This patch removes support for unthrottling in breakpoint events to break throttling and restore ptrace determinism. Signed-off-by: Frederic Weisbecker Cc: 2.6.33.x Cc: Peter Zijlstra Cc: K.Prasad Cc: Paul Mackerras --- arch/x86/kernel/hw_breakpoint.c | 5 ----- kernel/hw_breakpoint.c | 1 - 2 files changed, 6 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index bb6006e3e295..1e8ceadc0d6a 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -531,8 +531,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..4d99512ee149 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -489,5 +489,4 @@ struct pmu perf_ops_bp = { .enable = arch_install_hw_breakpoint, .disable = arch_uninstall_hw_breakpoint, .read = hw_breakpoint_pmu_read, - .unthrottle = hw_breakpoint_pmu_unthrottle }; From 1d6040f17d12a65b9f7ab4cb9fd6d721206b79ec Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 19:40:46 +0100 Subject: [PATCH 07/18] perf, x86: make IBS macros available in perf_event.h This patch moves code from oprofile to perf_event.h to make it also available for usage by perf. Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 10 ++++++++++ arch/x86/oprofile/op_model_amd.c | 11 ----------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index befd172c82ad..4933ccde96c4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -117,6 +117,16 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6a58256dce9f..c67174917305 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -46,17 +46,6 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL - -/* IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) - #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 From a163b1099dc7016704043c7fc572ae42519f08f7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 25 Feb 2010 19:43:07 +0100 Subject: [PATCH 08/18] perf, x86: add some IBS macros to perf_event.h Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 4 +++- arch/x86/oprofile/op_model_amd.c | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4933ccde96c4..c7f60e1297ab 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -121,12 +121,14 @@ union cpuid10_edx { #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c67174917305..8ddb9fa9c1b2 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -279,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } @@ -319,7 +319,7 @@ static inline void op_amd_start_ibs(void) return; if (ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); @@ -341,7 +341,7 @@ static inline void op_amd_start_ibs(void) * avoid underflows. */ ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, - 0xFFFFULL); + IBS_OP_MAX_CNT); } if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) ibs_op_ctl |= IBS_OP_CNT_CTL; From bb1165d6882f423f90fc7007a88c6c993b7c2ac4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 1 Mar 2010 14:21:23 +0100 Subject: [PATCH 09/18] perf, x86: rename macro in ARCH_PERFMON_EVENTSEL_ENABLE For consistency reasons this patch renames ARCH_PERFMON_EVENTSEL0_ENABLE to ARCH_PERFMON_EVENTSEL_ENABLE. The following is performed: $ sed -i -e s/ARCH_PERFMON_EVENTSEL0_ENABLE/ARCH_PERFMON_EVENTSEL_ENABLE/g \ arch/x86/include/asm/perf_event.h arch/x86/kernel/cpu/perf_event.c \ arch/x86/kernel/cpu/perf_event_p6.c \ arch/x86/kernel/cpu/perfctr-watchdog.c \ arch/x86/oprofile/op_model_amd.c arch/x86/oprofile/op_model_ppro.c Signed-off-by: Robert Richter --- arch/x86/include/asm/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 8 ++++---- arch/x86/kernel/cpu/perf_event_p6.c | 8 ++++---- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/oprofile/op_model_amd.c | 6 +++--- arch/x86/oprofile/op_model_ppro.c | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index c7f60e1297ab..80e693684f18 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -18,7 +18,7 @@ #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) #define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddbc..6531b4bdb22d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -553,9 +553,9 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu.eventsel + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -590,7 +590,7 @@ static void x86_pmu_enable_all(void) continue; val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -853,7 +853,7 @@ void hw_perf_enable(void) static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) { (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 1ca5ba078afd..a4e67b99d91c 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -72,7 +72,7 @@ static void p6_pmu_enable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -83,7 +83,7 @@ p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) u64 val = P6_NOP_EVENT; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } @@ -95,7 +95,7 @@ static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)checking_wrmsrl(hwc->config_base + idx, val); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 74f4e85a5727..fb329e9f8494 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8ddb9fa9c1b2..090cbbec7dbd 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -171,7 +171,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -398,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -418,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 5d1727ba409e..2bf90fafa7b5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -88,7 +88,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -166,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } @@ -184,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs) if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: [PATCH 10/18] perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ include/linux/perf_event.h | 15 +++++++++++++++ kernel/perf_event.c | 15 --------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6531b4bdb22d..aab2e1ce9dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -503,6 +503,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 04f06b4be297..90e0521b1690 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -857,6 +857,21 @@ extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + extern void perf_event_init(void); extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size); extern void perf_bp_event(struct perf_event *event, void *data); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..482d5e1d3764 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* From b622d644c7d61a5cb95b74e7b143c263bed21f0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2010 15:36:30 +0100 Subject: [PATCH 11/18] perf_events, x86: Fixup fixed counter constraints Patch 1da53e0230 ("perf_events, x86: Improve x86 event scheduling") lost us one of the fixed purpose counters and then ed8777fc13 ("perf_events, x86: Fix event constraint masks") broke it even further. Widen the fixed event mask to event+umask and specify the full config for each of the 3 fixed purpose counters. Then let the init code fill out the placement for the GP regs based on the cpuid info. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event.c | 25 +++++++++++++++------ arch/x86/kernel/cpu/perf_event_intel.c | 31 +++++++++++++++++--------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 80e693684f18..db6109a885a7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -50,7 +50,7 @@ INTEL_ARCH_INV_MASK| \ INTEL_ARCH_EDGE_MASK|\ INTEL_ARCH_UNIT_MASK|\ - INTEL_ARCH_EVTSEL_MASK) + INTEL_ARCH_EVENT_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index aab2e1ce9dee..bfc43fa208bc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -73,10 +73,10 @@ struct debug_store { struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64[1]; + u64 idxmsk64; }; - int code; - int cmask; + u64 code; + u64 cmask; int weight; }; @@ -103,7 +103,7 @@ struct cpu_hw_events { }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64[0] = (n) }, \ + { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ @@ -116,7 +116,7 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -615,8 +615,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - constraints[i] = - x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; } /* @@ -1350,6 +1350,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1398,6 +1399,16 @@ void __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0, x86_pmu.num_events); + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5f..4fbdfe5708d9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,7 +1,7 @@ #ifdef CONFIG_CPU_SUP_INTEL /* - * Intel PerfMon v3. Used on Core2 and later. + * Intel PerfMon, used on Core and later. */ static const u64 intel_perfmon_event_map[] = { @@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_core2_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. + */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] = INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] = static struct event_constraint intel_westmere_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] = static struct event_constraint intel_gen_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -935,7 +945,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; - case 28: + case 28: /* Atom */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -951,6 +961,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_westmere_event_constraints; pr_cont("Westmere events, "); break; + default: /* * default constraints for v2 and up From 6630125419ef37ff8781713c5e9d416f2a4ba357 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 2 Mar 2010 15:25:38 -0300 Subject: [PATCH 12/18] perf archive: Don't try to collect files without a build-id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid these error: [root@doppio ~]# perf archive tar: .build-id/00/00000000000000000000000000000000000000: Cannot stat: No such file or directory tar: .build-id/00/00000000000000000000000000000000000000: Cannot stat: No such file or directory tar: .build-id/00/00000000000000000000000000000000000000: Cannot stat: No such file or directory tar: .build-id/00/00000000000000000000000000000000000000: Cannot stat: No such file or directory tar: Exiting with failure status due to previous errors [root@doppio ~]# More work is needed to support archiving symtabs for binaries without a build-id, perhaps creating a perf.data UUID + adding build-ids for the binaries copied into the cache and then have this perf.data session UUID be a directory with symlinks to the by now calculated build-id of the files inside it. Or just do an extra pass and insert the calculated build-ids in the perf.data header. Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Cc: Frédéric Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- tools/perf/perf-archive.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh index 45fbe2f07b15..910468e6e01c 100644 --- a/tools/perf/perf-archive.sh +++ b/tools/perf/perf-archive.sh @@ -9,8 +9,9 @@ fi DEBUGDIR=~/.debug/ BUILDIDS=$(mktemp /tmp/perf-archive-buildids.XXXXXX) +NOBUILDID=0000000000000000000000000000000000000000 -perf buildid-list -i $PERF_DATA --with-hits > $BUILDIDS +perf buildid-list -i $PERF_DATA --with-hits | grep -v "^$NOBUILDID " > $BUILDIDS if [ ! -s $BUILDIDS ] ; then echo "perf archive: no build-ids found" rm -f $BUILDIDS From 29044ad1509ecc229f1d5a31aeed7a8dc61a71c4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Mar 2010 02:25:22 +0100 Subject: [PATCH 13/18] x86/stacktrace: Don't dereference bad frame pointers Callers of a stacktrace might pass bad frame pointers. Those are usually checked for safety in stack walking helpers before any dereferencing, but this is not the case when we need to go through one more frame pointer that backlinks the irq stack to the previous one, as we don't have any reliable address boudaries to compare this frame pointer against. This raises crashes when we record callchains for ftrace events with perf because we don't use the right helpers to capture registers there. We get wrong frame pointers as we call task_pt_regs() even on kernel threads, which is a wrong thing as it gives us the initial state of any kernel threads freshly created. This is even not what we want for user tasks. What we want is a hot snapshot of registers when the ftrace event triggers, not the state before a task entered the kernel. This requires more thoughts to do it correctly though. So first put a guardian to ensure the given frame pointer can be dereferenced to avoid crashes. We'll think about how to fix the callers in a subsequent patch. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: 2.6.33.x Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/dumpstack_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f5..a6c906c9b193 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -125,9 +125,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) - return (unsigned long)frame->next_frame; + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } #endif return bp; } From da7196e1f986c846ffa8b2ec385223fad38e8518 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 3 Mar 2010 11:47:58 +0000 Subject: [PATCH 14/18] perf, ARM: Modify kuser rmb() call to compile for Thumb-2 The Thumb-2 instruction set does not provide an encoding for sub pc, r0, #95 as present in the rmb() definition used by perf. This results in compilation failure when using a compiler targetting an instruction set other than ARM. This patch redefines rmb() for ARM by casting the address of the kuser helper to a function pointer, therefore getting the compiler to take care of making the call. Patch taken against tip/master. Signed-off-by: Will Deacon Cc: Russell King - ARM Linux Cc: Jamie Iles Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1267616878-2154-1-git-send-email-will.deacon@arm.com> Signed-off-by: Ingo Molnar --- tools/perf/perf.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 75f941bfba9e..6fb379bc1d1f 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -65,9 +65,7 @@ * Use the __kuser_memory_barrier helper in the CPU helper page. See * arch/arm/kernel/entry-armv.S in the kernel source for details. */ -#define rmb() asm volatile("mov r0, #0xffff0fff; mov lr, pc;" \ - "sub pc, r0, #95" ::: "r0", "lr", "cc", \ - "memory") +#define rmb() ((void(*)(void))0xffff0fa0)() #define cpu_relax() asm volatile("":::"memory") #endif From 10c95f4f41889daaa8130e0bd12209825dbe8d39 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 3 Mar 2010 01:04:32 -0600 Subject: [PATCH 15/18] perf trace/scripting: Remove extraneous header read perf_header__read() is already done in perf_session__open(), so remove it from the script gen case. Signed-off-by: Tom Zanussi Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <1267599873-8193-2-git-send-email-tzanussi@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-trace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 5db687fc13de..b8153db5a9eb 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -608,7 +608,6 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) return -1; } - perf_header__read(&session->header, input); err = scripting_ops->generate_script("perf-trace"); goto out; } From cf4fee50282312528e1f8adf73b1831d1d6ae389 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 3 Mar 2010 01:04:33 -0600 Subject: [PATCH 16/18] perf trace: Don't use pager if scripting It's useful for paging through raw traces, but just gets in the way when scripting. Signed-off-by: Tom Zanussi Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <1267599873-8193-3-git-send-email-tzanussi@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b8153db5a9eb..407041d20de0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -573,7 +573,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used) if (symbol__init() < 0) return -1; - setup_pager(); + if (!script_name) + setup_pager(); session = perf_session__new(input_name, O_RDONLY, 0); if (session == NULL) From cfb581bcd4f8c158c6f2b48bf5e232bb9e6855c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Mar 2010 15:20:50 +0100 Subject: [PATCH 17/18] MAINTAINERS: Add Arnaldo as tools/perf/ co-maintainer Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Acked-by: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2533fc45a44a..40ed22e02d62 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4221,6 +4221,7 @@ PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Paul Mackerras M: Ingo Molnar +M: Arnaldo Carvalho de Melo S: Supported F: kernel/perf_event.c F: include/linux/perf_event.h From dc1d628a67a8f042e711ea5accc0beedc3ef0092 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 15:55:04 +0100 Subject: [PATCH 18/18] perf: Provide generic perf_sample_data initialization This makes it easier to extend perf_sample_data and fixes a bug on arm and sparc, which failed to set ->raw to NULL, which can cause crashes when combined with PERF_SAMPLE_RAW. It also optimizes PowerPC and tracepoint, because the struct initialization is forced to zero out the whole structure. Signed-off-by: Peter Zijlstra Acked-by: Jean Pihet Reviewed-by: Frederic Weisbecker Acked-by: David S. Miller Cc: Jamie Iles Cc: Paul Mackerras Cc: Stephane Eranian Cc: stable@kernel.org LKML-Reference: <20100304140100.315416040@chello.nl> Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 4 ++-- arch/powerpc/kernel/perf_event.c | 8 ++++---- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- include/linux/perf_event.h | 7 +++++++ kernel/perf_event.c | 21 ++++++++------------- 7 files changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c54ceb3d1f97..3875d99cc40f 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -965,7 +965,7 @@ armv6pmu_handle_irq(int irq_num, */ armv6_pmcr_write(pmcr); - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx <= armpmu->num_events; ++idx) { @@ -1945,7 +1945,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx <= armpmu->num_events; ++idx) { diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index b6cf8f1f4d35..5120bd44f69a 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1164,10 +1164,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, * Finally record data if requested. */ if (record) { - struct perf_sample_data data = { - .addr = ~0ULL, - .period = event->hw.last_period, - }; + struct perf_sample_data data; + + perf_sample_data_init(&data, ~0ULL); + data.period = event->hw.last_period; if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 9f2b2bac8b2b..6504208f375f 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1189,7 +1189,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, regs = args->regs; - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 97cddbf32936..42aafd11e170 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1097,8 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 73102df8bfc1..44b60c852107 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -590,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; regs.ip = 0; /* @@ -742,8 +741,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 90e0521b1690..6f8cd7da1a01 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,13 @@ struct perf_sample_data { struct perf_raw_record *raw; }; +static inline +void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +{ + data->addr = addr; + data->raw = NULL; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e68745053013..4393b9e73740 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4108,8 +4108,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, if (rctx < 0) return; - data.addr = addr; - data.raw = NULL; + perf_sample_data_init(&data, addr); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); @@ -4154,11 +4153,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) struct perf_event *event; u64 period; - event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); event->pmu->read(event); - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); /* @@ -4322,17 +4320,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { + struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, .data = record, }; - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); + perf_sample_data_init(&data, addr); + data.raw = &raw; if (!regs) regs = task_pt_regs(current); @@ -4448,8 +4444,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - sample.raw = NULL; - sample.addr = bp->attr.bp_addr; + perf_sample_data_init(&sample, bp->attr.bp_addr); if (!perf_exclude_event(bp, regs)) perf_swevent_add(bp, 1, 1, &sample, regs);