From bc078e4eab65f11bbaeed380593ab8151b30d703 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 2 Mar 2010 16:01:10 +0100 Subject: [PATCH 01/24] oprofile: convert oprofile from timer_hook to hrtimer Oprofile is currently broken on systems running with NOHZ enabled. A maximum of 1 tick is accounted via the timer_hook if a cpu sleeps for a longer period of time. This does bad things to the percentages in the profiler output. To solve this problem convert oprofile to use a restarting hrtimer instead of the timer_hook. Signed-off-by: Martin Schwidefsky Signed-off-by: Robert Richter --- drivers/oprofile/oprof.c | 12 ++++-- drivers/oprofile/oprof.h | 3 +- drivers/oprofile/timer_int.c | 82 +++++++++++++++++++++++++++++++----- 3 files changed, 81 insertions(+), 16 deletions(-) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index dc8a0428260d..b336cd9ee7a1 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -253,22 +253,26 @@ static int __init oprofile_init(void) int err; err = oprofile_arch_init(&oprofile_ops); - if (err < 0 || timer) { printk(KERN_INFO "oprofile: using timer interrupt.\n"); - oprofile_timer_init(&oprofile_ops); + err = oprofile_timer_init(&oprofile_ops); + if (err) + goto out_arch; } - err = oprofilefs_register(); if (err) - oprofile_arch_exit(); + goto out_arch; + return 0; +out_arch: + oprofile_arch_exit(); return err; } static void __exit oprofile_exit(void) { + oprofile_timer_exit(); oprofilefs_unregister(); oprofile_arch_exit(); } diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index cb92f5c98c1a..47e12cb4ee8b 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -34,7 +34,8 @@ struct super_block; struct dentry; void oprofile_create_files(struct super_block *sb, struct dentry *root); -void oprofile_timer_init(struct oprofile_operations *ops); +int oprofile_timer_init(struct oprofile_operations *ops); +void oprofile_timer_exit(void); int oprofile_set_backtrace(unsigned long depth); int oprofile_set_timeout(unsigned long time); diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c index 333f915568c7..dc0ae4d14dff 100644 --- a/drivers/oprofile/timer_int.c +++ b/drivers/oprofile/timer_int.c @@ -13,34 +13,94 @@ #include #include #include +#include +#include +#include #include #include "oprof.h" -static int timer_notify(struct pt_regs *regs) +static DEFINE_PER_CPU(struct hrtimer, oprofile_hrtimer); + +static enum hrtimer_restart oprofile_hrtimer_notify(struct hrtimer *hrtimer) { - oprofile_add_sample(regs, 0); + oprofile_add_sample(get_irq_regs(), 0); + hrtimer_forward_now(hrtimer, ns_to_ktime(TICK_NSEC)); + return HRTIMER_RESTART; +} + +static void __oprofile_hrtimer_start(void *unused) +{ + struct hrtimer *hrtimer = &__get_cpu_var(oprofile_hrtimer); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = oprofile_hrtimer_notify; + + hrtimer_start(hrtimer, ns_to_ktime(TICK_NSEC), + HRTIMER_MODE_REL_PINNED); +} + +static int oprofile_hrtimer_start(void) +{ + on_each_cpu(__oprofile_hrtimer_start, NULL, 1); return 0; } -static int timer_start(void) +static void __oprofile_hrtimer_stop(int cpu) { - return register_timer_hook(timer_notify); + struct hrtimer *hrtimer = &per_cpu(oprofile_hrtimer, cpu); + + hrtimer_cancel(hrtimer); } - -static void timer_stop(void) +static void oprofile_hrtimer_stop(void) { - unregister_timer_hook(timer_notify); + int cpu; + + for_each_online_cpu(cpu) + __oprofile_hrtimer_stop(cpu); } - -void __init oprofile_timer_init(struct oprofile_operations *ops) +static int __cpuinit oprofile_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { + long cpu = (long) hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, __oprofile_hrtimer_start, + NULL, 1); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + __oprofile_hrtimer_stop(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata oprofile_cpu_notifier = { + .notifier_call = oprofile_cpu_notify, +}; + +int __init oprofile_timer_init(struct oprofile_operations *ops) +{ + int rc; + + rc = register_hotcpu_notifier(&oprofile_cpu_notifier); + if (rc) + return rc; ops->create_files = NULL; ops->setup = NULL; ops->shutdown = NULL; - ops->start = timer_start; - ops->stop = timer_stop; + ops->start = oprofile_hrtimer_start; + ops->stop = oprofile_hrtimer_stop; ops->cpu_type = "timer"; + return 0; +} + +void __exit oprofile_timer_exit(void) +{ + unregister_hotcpu_notifier(&oprofile_cpu_notifier); } From 4bdde044dc36ac7b01f7502394d52619af9d1927 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:58:05 +0800 Subject: [PATCH 02/24] tracing: Convert some signal events to DEFINE_TRACE Use DECLARE_EVENT_CLASS to remove duplicate code: text data bss dec hex filename 23639 6084 8 29731 7423 kernel/signal.o.orig 22727 6084 8 28819 7093 kernel/signal.o 2 events are converted: signal_queue_overflow: signal_overflow_fail, signal_lose_info No functional change. Acked-by: Masami Hiramatsu Signed-off-by: Li Zefan LKML-Reference: <4BA97FBD.8070703@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/events/signal.h | 52 +++++++++++++++-------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h index a510b75ac304..814566c99d29 100644 --- a/include/trace/events/signal.h +++ b/include/trace/events/signal.h @@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver, __entry->sa_handler, __entry->sa_flags) ); -/** - * signal_overflow_fail - called when signal queue is overflow - * @sig: signal number - * @group: signal to process group or not (bool) - * @info: pointer to struct siginfo - * - * Kernel fails to generate 'sig' signal with 'info' siginfo, because - * siginfo queue is overflow, and the signal is dropped. - * 'group' is not 0 if the signal will be sent to a process group. - * 'sig' is always one of RT signals. - */ -TRACE_EVENT(signal_overflow_fail, +DECLARE_EVENT_CLASS(signal_queue_overflow, TP_PROTO(int sig, int group, struct siginfo *info), @@ -134,6 +123,24 @@ TRACE_EVENT(signal_overflow_fail, __entry->sig, __entry->group, __entry->errno, __entry->code) ); +/** + * signal_overflow_fail - called when signal queue is overflow + * @sig: signal number + * @group: signal to process group or not (bool) + * @info: pointer to struct siginfo + * + * Kernel fails to generate 'sig' signal with 'info' siginfo, because + * siginfo queue is overflow, and the signal is dropped. + * 'group' is not 0 if the signal will be sent to a process group. + * 'sig' is always one of RT signals. + */ +DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail, + + TP_PROTO(int sig, int group, struct siginfo *info), + + TP_ARGS(sig, group, info) +); + /** * signal_lose_info - called when siginfo is lost * @sig: signal number @@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail, * 'group' is not 0 if the signal will be sent to a process group. * 'sig' is always one of non-RT signals. */ -TRACE_EVENT(signal_lose_info, +DEFINE_EVENT(signal_queue_overflow, signal_lose_info, TP_PROTO(int sig, int group, struct siginfo *info), - TP_ARGS(sig, group, info), - - TP_STRUCT__entry( - __field( int, sig ) - __field( int, group ) - __field( int, errno ) - __field( int, code ) - ), - - TP_fast_assign( - __entry->sig = sig; - __entry->group = group; - TP_STORE_SIGINFO(__entry, info); - ), - - TP_printk("sig=%d group=%d errno=%d code=%d", - __entry->sig, __entry->group, __entry->errno, __entry->code) + TP_ARGS(sig, group, info) ); + #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ From 50354a8a28d0c91695a2d6d25b5a821bfe557a07 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:58:24 +0800 Subject: [PATCH 03/24] tracing: Update comments Make some comments consistent with the code. Signed-off-by: Li Zefan LKML-Reference: <4BA97FD0.7090202@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index ea6f9d4a20e9..75dd7787fb37 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -154,9 +154,11 @@ * * field = (typeof(field))entry; * - * p = get_cpu_var(ftrace_event_seq); + * p = &get_cpu_var(ftrace_event_seq); * trace_seq_init(p); - * ret = trace_seq_printf(s, "\n"); + * ret = trace_seq_printf(s, "%s: ", ); + * if (ret) + * ret = trace_seq_printf(s, "\n"); * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; @@ -450,38 +452,38 @@ perf_trace_disable_##name(struct ftrace_event_call *unused) \ * * static void ftrace_raw_event_(proto) * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; * struct ring_buffer_event *event; * struct ftrace_raw_ *entry; <-- defined in stage 1 * struct ring_buffer *buffer; * unsigned long irq_flags; + * int __data_size; * int pc; * * local_save_flags(irq_flags); * pc = preempt_count(); * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * * event = trace_current_buffer_lock_reserve(&buffer, * event_.id, - * sizeof(struct ftrace_raw_), + * sizeof(*entry) + __data_size, * irq_flags, pc); * if (!event) * return; * entry = ring_buffer_event_data(event); * - * ; <-- Here we assign the entries by the __field and - * __array macros. + * { ; } <-- Here we assign the entries by the __field and + * __array macros. * - * trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); + * if (!filter_current_check_discard(buffer, event_call, entry, event)) + * trace_current_buffer_unlock_commit(buffer, + * event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_(struct ftrace_event_call *unused) * { - * int ret; - * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * return register_trace_(ftrace_raw_event_); * } * * static void ftrace_unreg_event_(struct ftrace_event_call *unused) @@ -493,6 +495,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused) \ * .trace = ftrace_raw_output_, <-- stage 2 * }; * + * static const char print_fmt_[] = ; + * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { @@ -501,6 +505,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused) \ * .raw_init = trace_event_raw_init, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, + * .print_fmt = print_fmt_, + * .define_fields = ftrace_define_fields_, * } * */ @@ -569,7 +575,6 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ return; \ entry = ring_buffer_event_data(event); \ \ - \ tstruct \ \ { assign; } \ From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 10:57:43 +0800 Subject: [PATCH 04/24] tracing: Remove side effect from module tracepoints that caused a GPF Remove the @refcnt argument, because it has side-effects, and arguments with side-effects are not skipped by the jump over disabled instrumentation and are executed even when the tracepoint is disabled. This was also causing a GPF as found by Randy Dunlap: Subject: 2.6.33 GP fault only when built with tracing LKML-Reference: <4BA2B69D.3000309@oracle.com> Note, the current 2.6.34-rc has a fix for the actual cause of the GPF, but this fixes one of its triggers. Tested-by: Randy Dunlap Acked-by: Mathieu Desnoyers Signed-off-by: Li Zefan LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/module.h | 6 ++---- include/trace/events/module.h | 14 +++++++------- kernel/module.c | 3 +-- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index 5e869ffd34aa..393ec39b580a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -460,8 +460,7 @@ static inline void __module_get(struct module *module) if (module) { preempt_disable(); __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); preempt_enable(); } } @@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module) if (likely(module_is_live(module))) { __this_cpu_inc(module->refptr->count); - trace_module_get(module, _THIS_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_get(module, _THIS_IP_); } else ret = 0; diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 4b0f48ba16a6..a585f8135bd9 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -53,9 +53,9 @@ TRACE_EVENT(module_free, DECLARE_EVENT_CLASS(module_refcnt, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt), + TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) @@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(module_refcnt, TP_fast_assign( __entry->ip = ip; - __entry->refcnt = refcnt; + __entry->refcnt = __this_cpu_read(mod->refptr->count); __assign_str(name, mod->name); ), @@ -75,16 +75,16 @@ DECLARE_EVENT_CLASS(module_refcnt, DEFINE_EVENT(module_refcnt, module_get, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt) + TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, - TP_PROTO(struct module *mod, unsigned long ip, int refcnt), + TP_PROTO(struct module *mod, unsigned long ip), - TP_ARGS(mod, ip, refcnt) + TP_ARGS(mod, ip) ); TRACE_EVENT(module_request, diff --git a/kernel/module.c b/kernel/module.c index c968d3606dca..21591ad921f3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -800,8 +800,7 @@ void module_put(struct module *module) preempt_disable(); __this_cpu_dec(module->refptr->count); - trace_module_put(module, _RET_IP_, - __this_cpu_read(module->refptr->count)); + trace_module_put(module, _RET_IP_); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); From eb0c53771fb2f5f66b0edb3ebce33be4bbf1c285 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Mar 2010 14:25:18 -0400 Subject: [PATCH 05/24] tracing: Fix compile error in module tracepoints when MODULE_UNLOAD not set If modules are configured in the build but unloading of modules is not, then the refcnt is not defined. Place the get/put module tracepoints under CONFIG_MODULE_UNLOAD since it references this field in the module structure. As a side-effect, this patch also reduces the code when MODULE_UNLOAD is not set, because these unused tracepoints are not created. Signed-off-by: Steven Rostedt --- include/trace/events/module.h | 4 ++++ kernel/module.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/trace/events/module.h b/include/trace/events/module.h index a585f8135bd9..f07b44a2b240 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -51,6 +51,9 @@ TRACE_EVENT(module_free, TP_printk("%s", __get_str(name)) ); +#ifdef CONFIG_MODULE_UNLOAD +/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ + DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), @@ -86,6 +89,7 @@ DEFINE_EVENT(module_refcnt, module_put, TP_ARGS(mod, ip) ); +#endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, diff --git a/kernel/module.c b/kernel/module.c index 21591ad921f3..d9e237926b69 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -59,8 +59,6 @@ #define CREATE_TRACE_POINTS #include -EXPORT_TRACEPOINT_SYMBOL(module_get); - #if 0 #define DEBUGP printk #else @@ -467,6 +465,9 @@ MODINFO_ATTR(srcversion); static char last_unloaded_module[MODULE_NAME_LEN+1]; #ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + /* Init the unload section of the module. */ static void module_unload_init(struct module *mod) { From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 13:21:56 -0400 Subject: [PATCH 06/24] ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter Cc: Andi Kleen Cc: Li Zefan Cc: Arnaldo Carvalho de Melo Cc: "Luis Claudio R. Goncalves" Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- drivers/oprofile/cpu_buffer.c | 4 +- include/linux/ring_buffer.h | 6 ++- kernel/trace/ring_buffer.c | 72 +++++++++++++++++++++++++--- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 4 +- kernel/trace/trace_functions_graph.c | 5 +- kernel/trace/trace_selftest.c | 2 +- 7 files changed, 79 insertions(+), 16 deletions(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 166b67ea622f..7581dbe456da 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry) struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) { struct ring_buffer_event *e; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL); if (e) goto event; if (ring_buffer_swap_cpu(op_ring_buffer_read, op_ring_buffer_write, cpu)) return NULL; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL); if (e) goto event; return NULL; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 5fcc31ed5771..c8297761e414 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts); +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events); struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d1187ef20caf..8295650444c5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -318,6 +318,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -416,6 +419,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + ret = trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), @@ -439,6 +448,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long lost_events; + unsigned long last_overrun; local_t commit_overrun; local_t overrun; local_t entries; @@ -2835,6 +2846,7 @@ static struct buffer_page * rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; + unsigned long overwrite; unsigned long flags; int nr_loops = 0; int ret; @@ -2895,6 +2907,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* The reader page will be pointing to the new head */ rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); + /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + /* * Here's the tricky part. * @@ -2926,6 +2950,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page = reader; rb_reset_reader_page(cpu_buffer); + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + goto again; out: @@ -3002,8 +3031,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) rb_advance_iter(iter); } +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct buffer_page *reader; @@ -3055,6 +3090,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); return event; default: @@ -3165,12 +3202,14 @@ static inline int rb_ok_to_lock(void) * @buffer: The ring buffer to read * @cpu: The cpu to peak at * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) * * This will return the event that will be read next, but does * not consume the data. */ struct ring_buffer_event * -ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; @@ -3185,7 +3224,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); + event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3227,13 +3266,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) /** * ring_buffer_consume - return an event and consume it * @buffer: The ring buffer to get the next event from + * @cpu: the cpu to read the buffer from + * @ts: a variable to store the timestamp (may be NULL) + * @lost_events: a variable to store if events were lost (may be NULL) * * Returns the next event in the ring buffer, and that event is consumed. * Meaning, that sequential reads will keep returning a different event, * and eventually empty the ring buffer if the producer is slower. */ struct ring_buffer_event * -ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event = NULL; @@ -3254,9 +3297,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(cpu_buffer, ts); - if (event) + event = rb_buffer_peek(cpu_buffer, ts, lost_events); + if (event) { + cpu_buffer->lost_events = 0; rb_advance_reader(cpu_buffer); + } if (dolock) spin_unlock(&cpu_buffer->reader_lock); @@ -3405,6 +3450,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; + cpu_buffer->lost_events = 0; + cpu_buffer->last_overrun = 0; + rb_head_page_activate(cpu_buffer); } @@ -3684,6 +3732,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + int missed_events = 0; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -3716,6 +3765,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer, read = reader->read; commit = rb_page_commit(reader); + /* Check if any events were dropped */ + if (cpu_buffer->lost_events) + missed_events = 1; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -3779,6 +3832,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } ret = read; + cpu_buffer->lost_events = 0; + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) + local_add(RB_MISSED_EVENTS, &bpage->commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index df74c7982255..dc56556b55a2 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -81,7 +81,7 @@ static enum event_status read_event(int cpu) int *entry; u64 ts; - event = ring_buffer_consume(buffer, cpu, &ts); + event = ring_buffer_consume(buffer, cpu, &ts, NULL); if (!event) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec2ee6f6560..fabb0033a9be 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1556,7 +1556,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL); ftrace_enable_cpu(); @@ -1635,7 +1635,7 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL); ftrace_enable_cpu(); } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e6989d9b44da..a7f75fb10aa4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -489,9 +489,10 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); + NULL, NULL); } if (!event) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 280fea470d67..e50180874c63 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 19:49:26 -0400 Subject: [PATCH 07/24] tracing: Show the lost events in the trace_pipe output Now that the ring buffer can keep track of where events are lost. Use this information to the output of trace_pipe: hackbench-3588 [001] 1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock hackbench-3588 [001] 1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock hackbench-3588 [001] 1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock CPU:1 [LOST 673 EVENTS] hackbench-3588 [001] 1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738 hackbench-3588 [001] 1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem hackbench-3588 [001] 1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem Even works with the function graph tracer: 2) ! 170.098 us | } 2) 4.036 us | rcu_irq_exit(); 2) 3.657 us | idle_cpu(); 2) ! 190.301 us | } CPU:2 [LOST 2196 EVENTS] 2) 0.853 us | } /* cancel_dirty_page */ 2) | remove_from_page_cache() { 2) 1.578 us | _raw_spin_lock_irq(); 2) | __remove_from_page_cache() { Note, it does not work with the iterator "trace" file, since it requires the use of consuming the page from the ring buffer to determine how many events were lost, which the iterator does not do. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.c | 30 ++++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c0f4b364c711..39e71b0a3bfd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -58,6 +58,7 @@ struct trace_iterator { /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; + unsigned long lost_events; int leftover; int cpu; u64 ts; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fabb0033a9be..0498bebcbfd1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1545,7 +1545,8 @@ static void trace_iterator_increment(struct trace_iterator *iter) } static struct trace_entry * -peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) { struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; @@ -1556,7 +1557,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL); + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); ftrace_enable_cpu(); @@ -1564,10 +1566,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) } static struct trace_entry * -__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + unsigned long lost_events, next_lost = 0; int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; @@ -1580,7 +1584,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (cpu_file > TRACE_PIPE_ALL_CPU) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; - ent = peek_next_entry(iter, cpu_file, ent_ts); + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); if (ent_cpu) *ent_cpu = cpu_file; @@ -1592,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = peek_next_entry(iter, cpu, &ts); + ent = peek_next_entry(iter, cpu, &ts, &lost_events); /* * Pick the entry with the smallest timestamp: @@ -1601,6 +1605,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) next = ent; next_cpu = cpu; next_ts = ts; + next_lost = lost_events; } } @@ -1610,6 +1615,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) if (ent_ts) *ent_ts = next_ts; + if (missing_events) + *missing_events = next_lost; + return next; } @@ -1617,13 +1625,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - return __find_next_entry(iter, ent_cpu, ent_ts); + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); } /* Find the next real entry, and increment the iterator to the next entry */ static void *find_next_entry_inc(struct trace_iterator *iter) { - iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); if (iter->ent) trace_iterator_increment(iter); @@ -1635,7 +1644,8 @@ static void trace_consume(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ ftrace_disable_cpu(); - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); ftrace_enable_cpu(); } @@ -2030,6 +2040,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); if (ret != TRACE_TYPE_UNHANDLED) From ff0ff84a0767df48d728c36510365344a7e7d582 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 31 Mar 2010 22:11:42 -0400 Subject: [PATCH 08/24] ring-buffer: Add lost event count to end of sub buffer Currently, binary readers of the ring buffer only know where events were lost, but not how many events were lost at that location. This information is available, but it would require adding another field to the sub buffer header to include it. But when a event can not fit at the end of a sub buffer, it is written to the next sub buffer. This means there is a good chance that the buffer may have room to hold this counter. If it does, write the counter at the end of the sub buffer and set another flag in the data size field that states that this information exists. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8295650444c5..dc6d563a6d22 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -320,6 +320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) struct buffer_data_page { u64 time_stamp; /* page time stamp */ @@ -340,6 +342,7 @@ struct buffer_page { local_t write; /* index for next write */ unsigned read; /* index for next read */ local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ struct buffer_data_page *page; /* Actual data page */ }; @@ -1769,6 +1772,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* + * Save the original length to the meta data. + * This will be used by the reader to add lost event + * counter. + */ + tail_page->real_end = tail; + /* * If this event is bigger than the minimum size, then * we need to be careful that we don't subtract the @@ -2888,6 +2898,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; spin: /* @@ -3728,11 +3739,11 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; struct buffer_page *reader; + unsigned long missed_events; unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int missed_events = 0; int ret = -1; if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -3766,8 +3777,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, commit = rb_page_commit(reader); /* Check if any events were dropped */ - if (cpu_buffer->lost_events) - missed_events = 1; + missed_events = cpu_buffer->lost_events; /* * If this page has been partially read or @@ -3829,6 +3839,14 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. + */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); } ret = read; @@ -3836,8 +3854,19 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* * Set a flag in the commit field if we lost events */ - if (missed_events) + if (missed_events) { + commit = local_read(&bpage->commit); + + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + } local_add(RB_MISSED_EVENTS, &bpage->commit); + } out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); From cb6e943ccf19ab6d3189147e9d625a992e016084 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 1 Apr 2010 03:17:25 +0200 Subject: [PATCH 09/24] oprofile: remove double ring buffering oprofile used a double buffer scheme for its cpu event buffer to avoid races on reading with the old locked ring buffer. But that is obsolete now with the new ring buffer, so simply use a single buffer. This greatly simplifies the code and avoids a lot of sample drops on large runs, especially with call graph. Based on suggestions from Steven Rostedt For stable kernels from v2.6.32, but not earlier. Signed-off-by: Andi Kleen Cc: Steven Rostedt Cc: stable Signed-off-by: Robert Richter --- drivers/oprofile/cpu_buffer.c | 63 ++++++++--------------------------- 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 166b67ea622f..de82183bb9b3 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -30,23 +30,7 @@ #define OP_BUFFER_FLAGS 0 -/* - * Read and write access is using spin locking. Thus, writing to the - * buffer by NMI handler (x86) could occur also during critical - * sections when reading the buffer. To avoid this, there are 2 - * buffers for independent read and write access. Read access is in - * process context only, write access only in the NMI handler. If the - * read buffer runs empty, both buffers are swapped atomically. There - * is potentially a small window during swapping where the buffers are - * disabled and samples could be lost. - * - * Using 2 buffers is a little bit overhead, but the solution is clear - * and does not require changes in the ring buffer implementation. It - * can be changed to a single buffer solution when the ring buffer - * access is implemented as non-locking atomic code. - */ -static struct ring_buffer *op_ring_buffer_read; -static struct ring_buffer *op_ring_buffer_write; +static struct ring_buffer *op_ring_buffer; DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer); static void wq_sync_buffer(struct work_struct *work); @@ -68,12 +52,9 @@ void oprofile_cpu_buffer_inc_smpl_lost(void) void free_cpu_buffers(void) { - if (op_ring_buffer_read) - ring_buffer_free(op_ring_buffer_read); - op_ring_buffer_read = NULL; - if (op_ring_buffer_write) - ring_buffer_free(op_ring_buffer_write); - op_ring_buffer_write = NULL; + if (op_ring_buffer) + ring_buffer_free(op_ring_buffer); + op_ring_buffer = NULL; } #define RB_EVENT_HDR_SIZE 4 @@ -86,11 +67,8 @@ int alloc_cpu_buffers(void) unsigned long byte_size = buffer_size * (sizeof(struct op_sample) + RB_EVENT_HDR_SIZE); - op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS); - if (!op_ring_buffer_read) - goto fail; - op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS); - if (!op_ring_buffer_write) + op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS); + if (!op_ring_buffer) goto fail; for_each_possible_cpu(i) { @@ -162,16 +140,11 @@ struct op_sample *op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size) { entry->event = ring_buffer_lock_reserve - (op_ring_buffer_write, sizeof(struct op_sample) + + (op_ring_buffer, sizeof(struct op_sample) + size * sizeof(entry->sample->data[0])); - if (entry->event) - entry->sample = ring_buffer_event_data(entry->event); - else - entry->sample = NULL; - - if (!entry->sample) + if (!entry->event) return NULL; - + entry->sample = ring_buffer_event_data(entry->event); entry->size = size; entry->data = entry->sample->data; @@ -180,25 +153,16 @@ struct op_sample int op_cpu_buffer_write_commit(struct op_entry *entry) { - return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event); + return ring_buffer_unlock_commit(op_ring_buffer, entry->event); } struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) { struct ring_buffer_event *e; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - goto event; - if (ring_buffer_swap_cpu(op_ring_buffer_read, - op_ring_buffer_write, - cpu)) + e = ring_buffer_consume(op_ring_buffer, cpu, NULL); + if (!e) return NULL; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - goto event; - return NULL; -event: entry->event = e; entry->sample = ring_buffer_event_data(e); entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample)) @@ -209,8 +173,7 @@ event: unsigned long op_cpu_buffer_entries(int cpu) { - return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) - + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); + return ring_buffer_entries_cpu(op_ring_buffer, cpu); } static int From 9414e99672271adcc661f3c160a30b374179b92f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Wed, 28 Apr 2010 12:09:16 -0500 Subject: [PATCH 10/24] oprofile: protect from not being in an IRQ context http://lkml.org/lkml/2010/4/27/285 Protect against dereferencing regs when it's NULL, and force a magic number into pc to prevent too deep processing. This approach permits the dropped samples to be tallied as invalid Instruction Pointer events. e.g. output from about 15mins at 10kHz sample rate: Nr. samples received: 2565380 Nr. samples lost invalid pc: 4 Signed-off-by: Phil Carmody Signed-off-by: Robert Richter --- drivers/oprofile/cpu_buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 0ac8b065ee02..219f79e2210a 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -319,8 +319,16 @@ void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) { - int is_kernel = !user_mode(regs); - unsigned long pc = profile_pc(regs); + int is_kernel; + unsigned long pc; + + if (likely(regs)) { + is_kernel = !user_mode(regs); + pc = profile_pc(regs); + } else { + is_kernel = 0; /* This value will not be used */ + pc = ESCAPE_CODE; /* as this causes an early return. */ + } __oprofile_add_ext_sample(pc, regs, event, is_kernel); } From 81c4a8a6733ad2ff49c0e077b51403367601b3e7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 22 Apr 2010 19:14:49 +0200 Subject: [PATCH 11/24] oprofile: update file list in MAINTAINERS file File list now catches: $ xargs | eval ls -d $(cat) | sort -u arch/*/include/asm/oprofile*.h arch/*/oprofile/ drivers/oprofile/ include/linux/oprofile.h arch/alpha/oprofile/ arch/arm/oprofile/ arch/avr32/oprofile/ arch/blackfin/oprofile/ arch/ia64/oprofile/ arch/m32r/oprofile/ arch/microblaze/oprofile/ arch/mips/oprofile/ arch/mn10300/oprofile/ arch/parisc/oprofile/ arch/powerpc/include/asm/oprofile_impl.h arch/powerpc/oprofile/ arch/s390/oprofile/ arch/sh/oprofile/ arch/sparc/oprofile/ arch/x86/oprofile/ drivers/oprofile/ include/linux/oprofile.h Signed-off-by: Robert Richter --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a0e3c3a47a51..31e858617d0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4165,6 +4165,7 @@ OPROFILE M: Robert Richter L: oprofile-list@lists.sf.net S: Maintained +F: arch/*/include/asm/oprofile*.h F: arch/*/oprofile/ F: drivers/oprofile/ F: include/linux/oprofile.h From 8f5a2dd83a1f8e89fdc17eb0f2f07c2e713e635a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Mar 2010 19:09:51 +0100 Subject: [PATCH 12/24] oprofile/x86: rework error handler in nmi_setup() This patch improves the error handler in nmi_setup(). Most parts of the code are moved to allocate_msrs(). In case of an error allocate_msrs() also frees already allocated memory. nmi_setup() becomes easier and better extendable. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 2c505ee71014..c0c21f200faf 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -295,6 +295,7 @@ static void free_msrs(void) kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; } + nmi_shutdown_mux(); } static int allocate_msrs(void) @@ -307,14 +308,21 @@ static int allocate_msrs(void) per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).counters) - return 0; + goto fail; per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) - return 0; + goto fail; } + if (!nmi_setup_mux()) + goto fail; + return 1; + +fail: + free_msrs(); + return 0; } static void nmi_cpu_setup(void *dummy) @@ -342,17 +350,7 @@ static int nmi_setup(void) int cpu; if (!allocate_msrs()) - err = -ENOMEM; - else if (!nmi_setup_mux()) - err = -ENOMEM; - else - err = register_die_notifier(&profile_exceptions_nb); - - if (err) { - free_msrs(); - nmi_shutdown_mux(); - return err; - } + return -ENOMEM; /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations @@ -374,9 +372,17 @@ static int nmi_setup(void) mux_clone(cpu); } + + err = register_die_notifier(&profile_exceptions_nb); + if (err) + goto fail; + on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; +fail: + free_msrs(); + return err; } static void nmi_cpu_restore_registers(struct op_msrs *msrs) @@ -421,7 +427,6 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); - nmi_shutdown_mux(); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); From d0e4120fda6f87eead438eed4d49032e12060e58 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Mar 2010 19:33:21 +0100 Subject: [PATCH 13/24] oprofile/x86: reserve counter msrs pairwise For AMD's and Intel's P6 generic performance counters have pairwise counter and control msrs. This patch changes the counter reservation in a way that both msrs must be registered. It joins some counter loops and also removes the unnecessary NUM_CONTROLS macro in the AMD implementation. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 43 ++++++++++++++----------------- arch/x86/oprofile/op_model_ppro.c | 36 ++++++++++++-------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 090cbbec7dbd..2e2bc902b867 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -30,13 +30,10 @@ #include "op_counter.h" #define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX #define NUM_VIRT_COUNTERS 32 -#define NUM_VIRT_CONTROLS 32 #else #define NUM_VIRT_COUNTERS NUM_COUNTERS -#define NUM_VIRT_CONTROLS NUM_CONTROLS #endif #define OP_EVENT_MASK 0x0FFF @@ -134,13 +131,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + continue; + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + continue; + } + /* both registers must be reserved */ + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; } } @@ -160,7 +159,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0; i < NUM_CONTROLS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!msrs->controls[i].addr)) { if (counter_config[i].enabled && !smp_processor_id()) /* @@ -175,12 +174,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!msrs->counters[i].addr)) - continue; + /* + * avoid a false detection of ctr overflows in NMI + * handler + */ wrmsrl(msrs->counters[i].addr, -1LL); } @@ -430,12 +427,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) int i; for (i = 0; i < NUM_COUNTERS; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0; i < NUM_CONTROLS; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } @@ -583,7 +578,7 @@ static void op_amd_exit(void) struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, + .num_controls = NUM_COUNTERS, .num_virt_counters = NUM_VIRT_COUNTERS, .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 2bf90fafa7b5..f8e268e8e992 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -35,13 +35,15 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < num_counters; i++) { - if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - } - - for (i = 0; i < num_counters; i++) { - if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) + continue; + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + continue; + } + /* both registers must be reserved */ + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; } } @@ -92,12 +94,10 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->counters[i].addr)) - continue; + /* + * avoid a false detection of ctr overflows in NMI * + * handler + */ wrmsrl(msrs->counters[i].addr, -1LL); } @@ -194,12 +194,10 @@ static void ppro_shutdown(struct op_msrs const * const msrs) int i; for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - } - for (i = 0; i < num_counters; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { kfree(reset_value); From 83300ce0df6b72e156b386457aa0f0902b8c0a98 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 23 Mar 2010 20:01:54 +0100 Subject: [PATCH 14/24] oprofile/x86: moving shutdown functions Moving some code in preparation of the next patch. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 24 +++++++++---------- arch/x86/oprofile/op_model_p4.c | 38 +++++++++++++++---------------- arch/x86/oprofile/op_model_ppro.c | 33 +++++++++++++-------------- 3 files changed, 46 insertions(+), 49 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 2e2bc902b867..7e5886d54bd5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -126,6 +126,18 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, /* functions for op_amd_spec */ +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + static void op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; @@ -422,18 +434,6 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index e6a160a4684a..7cc80df330d5 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -385,6 +385,24 @@ static unsigned int get_stagger(void) static unsigned long reset_value[NUM_COUNTERS_NON_HT]; +static void p4_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) + release_perfctr_nmi(msrs->counters[i].addr); + } + /* + * some of the control registers are specially reserved in + * conjunction with the counter registers (hence the starting offset). + * This saves a few bits. + */ + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) + release_evntsel_nmi(msrs->controls[i].addr); + } +} static void p4_fill_in_addresses(struct op_msrs * const msrs) { @@ -668,26 +686,6 @@ static void p4_stop(struct op_msrs const * const msrs) } } -static void p4_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(msrs->counters[i].addr); - } - /* - * some of the control registers are specially reserved in - * conjunction with the counter registers (hence the starting offset). - * This saves a few bits. - */ - for (i = num_counters; i < num_controls; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(msrs->controls[i].addr); - } -} - - #ifdef CONFIG_SMP struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index f8e268e8e992..b07d25a52f02 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -30,6 +30,22 @@ static int counter_width = 32; static u64 *reset_value; +static void ppro_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + } + if (reset_value) { + kfree(reset_value); + reset_value = NULL; + } +} + static void ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; @@ -189,23 +205,6 @@ static void ppro_stop(struct op_msrs const * const msrs) } } -static void ppro_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); - } - if (reset_value) { - kfree(reset_value); - reset_value = NULL; - } -} - - struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, From 8617f98c001d00b176422d707e6a67b88bcd7e0d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 26 Feb 2010 17:20:55 +0100 Subject: [PATCH 15/24] oprofile/x86: return -EBUSY if counters are already reserved In case a counter is already reserved by the watchdog or perf_event subsystem, oprofile ignored this counters silently. This case is handled now and oprofile_setup() now reports an error. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 ++++- arch/x86/oprofile/op_model_amd.c | 24 +++++++++++++----------- arch/x86/oprofile/op_model_p4.c | 14 +++++++++++++- arch/x86/oprofile/op_model_ppro.c | 24 +++++++++++++----------- arch/x86/oprofile/op_x86_model.h | 2 +- 5 files changed, 44 insertions(+), 25 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c0c21f200faf..9f001d904599 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -357,7 +357,10 @@ static int nmi_setup(void) */ /* Assume saved/restored counters are the same on all CPUs */ - model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + if (err) + goto fail; + for_each_possible_cpu(cpu) { if (!cpu) continue; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 7e5886d54bd5..536d0b0b39a5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -138,21 +138,30 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -static void op_amd_fill_in_addresses(struct op_msrs * const msrs) +static int op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; for (i = 0; i < NUM_COUNTERS; i++) { if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - continue; + goto fail; if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - continue; + goto fail; } /* both registers must be reserved */ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + op_amd_shutdown(msrs); + return -EBUSY; } + + return 0; } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -172,15 +181,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!msrs->controls[i].addr)) { - if (counter_config[i].enabled && !smp_processor_id()) - /* - * counter is reserved, this is on all - * cpus, so report only for cpu #0 - */ - op_x86_warn_reserved(i); + if (!msrs->controls[i].addr) continue; - } rdmsrl(msrs->controls[i].addr, val); if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 7cc80df330d5..182558dd5515 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -404,7 +404,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) } } -static void p4_fill_in_addresses(struct op_msrs * const msrs) +static int p4_fill_in_addresses(struct op_msrs * const msrs) { unsigned int i; unsigned int addr, cccraddr, stag; @@ -486,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; } } + + for (i = 0; i < num_counters; ++i) { + if (!counter_config[i].enabled) + continue; + if (msrs->controls[i].addr) + continue; + op_x86_warn_reserved(i); + p4_shutdown(msrs); + return -EBUSY; + } + + return 0; } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index b07d25a52f02..1fd17cfb956b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -46,21 +46,30 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } } -static void ppro_fill_in_addresses(struct op_msrs * const msrs) +static int ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) - continue; + goto fail; if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - continue; + goto fail; } /* both registers must be reserved */ msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + ppro_shutdown(msrs); + return -EBUSY; } + + return 0; } @@ -96,15 +105,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->controls[i].addr)) { - if (counter_config[i].enabled && !smp_processor_id()) - /* - * counter is reserved, this is on all - * cpus, so report only for cpu #0 - */ - op_x86_warn_reserved(i); + if (!msrs->controls[i].addr) continue; - } rdmsrl(msrs->controls[i].addr, val); if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index ff82a755edd4..551401398fba 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -41,7 +41,7 @@ struct op_x86_model_spec { u16 event_mask; int (*init)(struct oprofile_operations *ops); void (*exit)(void); - void (*fill_in_addresses)(struct op_msrs * const msrs); + int (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_x86_model_spec const *model, struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, From da759fe5be24ec3b236a76c007b460cf6caf2009 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 26 Feb 2010 10:54:56 +0100 Subject: [PATCH 16/24] oprofile/x86: move IBS code Moving code to make future changes easier. This groups all IBS code together. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 220 +++++++++++++++---------------- 1 file changed, 110 insertions(+), 110 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 536d0b0b39a5..e159254fb7cd 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -102,116 +102,6 @@ static u32 get_ibs_caps(void) return ibs_caps; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; ++i) { - if (!msrs->counters[i].addr) - continue; - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -static int op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - goto fail; - } - /* both registers must be reserved */ - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - continue; - fail: - if (!counter_config[i].enabled) - continue; - op_x86_warn_reserved(i); - op_amd_shutdown(msrs); - return -EBUSY; - } - - return 0; -} - -static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* setup reset_value */ - for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled - && msrs->counters[op_x86_virt_to_phys(i)].addr) - reset_value[i] = counter_config[i].count; - else - reset_value[i] = 0; - } - - /* clear all counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (!msrs->controls[i].addr) - continue; - rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) - op_x86_warn_in_use(i); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - /* - * avoid a false detection of ctr overflows in NMI - * handler - */ - wrmsrl(msrs->counters[i].addr, -1LL); - } - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!reset_value[virt]) - continue; - - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); - - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - /* * 16-bit Linear Feedback Shift Register (LFSR) * @@ -376,6 +266,116 @@ static void op_amd_stop_ibs(void) wrmsrl(MSR_AMD64_IBSOPCTL, 0); } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +static int op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; i++) { + if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + op_amd_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* setup reset_value */ + for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + + /* clear all counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { From 5bdb7934ca4115a12c7d585c5a45312b1c36909b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 31 Mar 2010 11:58:36 +0200 Subject: [PATCH 17/24] oprofile/x86: remove duplicate IBS capability check The check is already done in ibs_exit(). Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index e159254fb7cd..384c52410480 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -490,8 +490,7 @@ static int init_ibs_nmi(void) /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (ibs_caps) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ From 2623a1d55a6260c855e1f6d1895900b50b40a896 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 May 2010 19:44:32 +0200 Subject: [PATCH 18/24] oprofile/x86: fix uninitialized counter usage during cpu hotplug This fixes a NULL pointer dereference that is triggered when taking a cpu offline after oprofile was initialized, e.g.: $ opcontrol --init $ opcontrol --start-daemon $ opcontrol --shutdown $ opcontrol --deinit $ echo 0 > /sys/devices/system/cpu/cpu1/online See the crash dump below. Though the counter has been disabled the cpu notifier is still active and trying to use already freed counter data. This fix is for linux-stable. To proper fix this, the hotplug code must be rewritten. Thus I will leave a WARN_ON_ONCE() message with this patch. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] op_amd_stop+0x2d/0x8e PGD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu1/online CPU 1 Modules linked in: Pid: 0, comm: swapper Not tainted 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Anaheim/Anaheim RIP: 0010:[] [] op_amd_stop+0x2d/0x8e RSP: 0018:ffff880001843f28 EFLAGS: 00010006 RAX: 0000000000000000 RBX: 0000000000000000 RCX: dead000000200200 RDX: ffff880001843f68 RSI: dead000000100100 RDI: 0000000000000000 RBP: ffff880001843f48 R08: 0000000000000000 R09: ffff880001843f08 R10: ffffffff8102c9a5 R11: ffff88000184ea80 R12: 0000000000000000 R13: ffff88000184f6c0 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fec6a92e6f0(0000) GS:ffff880001840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000000163b000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffff88042fcd8000, task ffff88042fcd51d0) Stack: ffff880001843f48 0000000000000001 ffff88042e9f7d38 ffff880001843f68 <0> ffff880001843f58 ffffffff8132a602 ffff880001843f98 ffffffff810521b3 <0> ffff880001843f68 ffff880001843f68 ffff880001843f88 ffff88042fcd9fd8 Call Trace: [] nmi_cpu_stop+0x21/0x23 [] generic_smp_call_function_single_interrupt+0xdf/0x11b [] smp_call_function_single_interrupt+0x22/0x31 [] call_function_single_interrupt+0x13/0x20 [] ? wake_up_process+0x10/0x12 [] ? default_idle+0x22/0x37 [] c1e_idle+0xdf/0xe6 [] ? atomic_notifier_call_chain+0x13/0x15 [] cpu_idle+0x4b/0x7e [] start_secondary+0x1ae/0x1b2 Code: 89 e5 41 55 49 89 fd 41 54 45 31 e4 53 31 db 48 83 ec 08 89 df e8 be f8 ff ff 48 98 48 83 3c c5 10 67 7a 81 00 74 1f 49 8b 45 08 <42> 8b 0c 20 0f 32 48 c1 e2 20 25 ff ff bf ff 48 09 d0 48 89 c2 RIP [] op_amd_stop+0x2d/0x8e RSP CR2: 0000000000000000 ---[ end trace 679ac372d674b757 ]--- Kernel panic - not syncing: Fatal exception in interrupt Pid: 0, comm: swapper Tainted: G D 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Call Trace: [] panic+0x9e/0x10c [] ? up+0x34/0x39 [] ? kmsg_dump+0x112/0x12c [] oops_end+0x81/0x8e [] no_context+0x1f3/0x202 [] __bad_area_nosemaphore+0x1ba/0x1e0 [] ? enqueue_task_fair+0x16d/0x17a [] ? activate_task+0x42/0x53 [] ? try_to_wake_up+0x272/0x284 [] bad_area_nosemaphore+0xe/0x10 [] do_page_fault+0x1c8/0x37c [] ? enqueue_task_fair+0x16d/0x17a [] page_fault+0x1f/0x30 [] ? wake_up_process+0x10/0x12 [] ? op_amd_stop+0x2d/0x8e [] ? op_amd_stop+0x1c/0x8e [] nmi_cpu_stop+0x21/0x23 [] generic_smp_call_function_single_interrupt+0xdf/0x11b [] smp_call_function_single_interrupt+0x22/0x31 [] call_function_single_interrupt+0x13/0x20 [] ? wake_up_process+0x10/0x12 [] ? default_idle+0x22/0x37 [] c1e_idle+0xdf/0xe6 [] ? atomic_notifier_call_chain+0x13/0x15 [] cpu_idle+0x4b/0x7e [] start_secondary+0x1ae/0x1b2 ------------[ cut here ]------------ WARNING: at /local/rrichter/.source/linux/arch/x86/kernel/smp.c:118 native_smp_send_reschedule+0x27/0x53() Hardware name: Anaheim Modules linked in: Pid: 0, comm: swapper Tainted: G D 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Call Trace: [] ? native_smp_send_reschedule+0x27/0x53 [] warn_slowpath_common+0x77/0xa4 [] warn_slowpath_null+0xf/0x11 [] native_smp_send_reschedule+0x27/0x53 [] resched_task+0x60/0x62 [] check_preempt_curr_idle+0x10/0x12 [] try_to_wake_up+0x1f5/0x284 [] default_wake_function+0xd/0xf [] pollwake+0x57/0x5a [] ? default_wake_function+0x0/0xf [] __wake_up_common+0x46/0x75 [] __wake_up+0x38/0x50 [] printk_tick+0x39/0x3b [] update_process_times+0x3f/0x5c [] tick_periodic+0x5d/0x69 [] tick_handle_periodic+0x21/0x71 [] smp_apic_timer_interrupt+0x82/0x95 [] apic_timer_interrupt+0x13/0x20 [] ? panic_blink_one_second+0x0/0x7b [] ? panic+0x10a/0x10c [] ? up+0x34/0x39 [] ? kmsg_dump+0x112/0x12c [] ? oops_end+0x81/0x8e [] ? no_context+0x1f3/0x202 [] ? __bad_area_nosemaphore+0x1ba/0x1e0 [] ? enqueue_task_fair+0x16d/0x17a [] ? activate_task+0x42/0x53 [] ? try_to_wake_up+0x272/0x284 [] ? bad_area_nosemaphore+0xe/0x10 [] ? do_page_fault+0x1c8/0x37c [] ? enqueue_task_fair+0x16d/0x17a [] ? page_fault+0x1f/0x30 [] ? wake_up_process+0x10/0x12 [] ? op_amd_stop+0x2d/0x8e [] ? op_amd_stop+0x1c/0x8e [] ? nmi_cpu_stop+0x21/0x23 [] ? generic_smp_call_function_single_interrupt+0xdf/0x11b [] ? smp_call_function_single_interrupt+0x22/0x31 [] ? call_function_single_interrupt+0x13/0x20 [] ? wake_up_process+0x10/0x12 [] ? default_idle+0x22/0x37 [] ? c1e_idle+0xdf/0xe6 [] ? atomic_notifier_call_chain+0x13/0x15 [] ? cpu_idle+0x4b/0x7e [] ? start_secondary+0x1ae/0x1b2 ---[ end trace 679ac372d674b758 ]--- Cc: Andi Kleen Cc: stable Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 9f001d904599..24582040b718 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -95,7 +95,10 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) static void nmi_cpu_start(void *dummy) { struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->start(msrs); } static int nmi_start(void) @@ -107,7 +110,10 @@ static int nmi_start(void) static void nmi_cpu_stop(void *dummy) { struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->stop(msrs); } static void nmi_stop(void) From 216f3d9b4e5121feea4b13fae9d4c83e8d7e1c8a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 May 2010 11:58:46 +0200 Subject: [PATCH 19/24] oprofile/x86: remove CONFIG_SMP macros CPU notifier register functions also exist if CONFIG_SMP is disabled. This change is part of hotplug code rework and also necessary for later patches. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 24582040b718..c5df8ee76ee4 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -471,7 +471,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } -#ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) { @@ -491,7 +490,6 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, static struct notifier_block oprofile_cpu_nb = { .notifier_call = oprofile_cpu_notifier }; -#endif #ifdef CONFIG_PM @@ -701,9 +699,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } -#ifdef CONFIG_SMP register_cpu_notifier(&oprofile_cpu_nb); -#endif + /* default values, can be overwritten by model */ ops->create_files = nmi_create_files; ops->setup = nmi_setup; @@ -732,9 +729,7 @@ void op_nmi_exit(void) { if (using_nmi) { exit_sysfs(); -#ifdef CONFIG_SMP unregister_cpu_notifier(&oprofile_cpu_nb); -#endif } if (model->exit) model->exit(); From 6ae56b55bc364bc2f2342f599b46581627ba22da Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 29 Apr 2010 14:55:55 +0200 Subject: [PATCH 20/24] oprofile/x86: protect cpu hotplug sections This patch reworks oprofile cpu hotplug code as follows: Introduce ctr_running variable to check, if counters are running or not. The state must be known for taking a cpu on or offline and when switching counters during counter multiplexing. Protect on_each_cpu() sections with get_online_cpus()/put_online_cpu() functions. This is necessary if notifiers or states are modified. Within these sections the cpu mask may not change. Switch only between counters in nmi_cpu_switch(), if counters are running. Otherwise the switch may restart a counter though they are disabled. Add nmi_cpu_setup() and nmi_cpu_shutdown() to cpu hotplug code. The function must also be called to avoid uninitialzed counter usage. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 52 ++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c5df8ee76ee4..b56601eaf29d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -31,8 +31,9 @@ static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); -/* 0 == registered but off, 1 == registered and on */ -static int nmi_enabled = 0; +/* must be protected with get_online_cpus()/put_online_cpus(): */ +static int nmi_enabled; +static int ctr_running; struct op_counter_config counter_config[OP_MAX_COUNTER]; @@ -103,7 +104,10 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { + get_online_cpus(); on_each_cpu(nmi_cpu_start, NULL, 1); + ctr_running = 1; + put_online_cpus(); return 0; } @@ -118,7 +122,10 @@ static void nmi_cpu_stop(void *dummy) static void nmi_stop(void) { + get_online_cpus(); on_each_cpu(nmi_cpu_stop, NULL, 1); + ctr_running = 0; + put_online_cpus(); } #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX @@ -258,7 +265,10 @@ static int nmi_switch_event(void) if (nmi_multiplex_on() < 0) return -EINVAL; /* not necessary */ - on_each_cpu(nmi_cpu_switch, NULL, 1); + get_online_cpus(); + if (ctr_running) + on_each_cpu(nmi_cpu_switch, NULL, 1); + put_online_cpus(); return 0; } @@ -386,8 +396,11 @@ static int nmi_setup(void) if (err) goto fail; + get_online_cpus(); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; + put_online_cpus(); + return 0; fail: free_msrs(); @@ -433,8 +446,11 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; - nmi_enabled = 0; + get_online_cpus(); on_each_cpu(nmi_cpu_shutdown, NULL, 1); + nmi_enabled = 0; + ctr_running = 0; + put_online_cpus(); unregister_die_notifier(&profile_exceptions_nb); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); @@ -442,6 +458,22 @@ static void nmi_shutdown(void) put_cpu_var(cpu_msrs); } +static void nmi_cpu_up(void *dummy) +{ + if (nmi_enabled) + nmi_cpu_setup(dummy); + if (ctr_running) + nmi_cpu_start(dummy); +} + +static void nmi_cpu_down(void *dummy) +{ + if (ctr_running) + nmi_cpu_stop(dummy); + if (nmi_enabled) + nmi_cpu_shutdown(dummy); +} + static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; @@ -478,10 +510,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, switch (action) { case CPU_DOWN_FAILED: case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); break; } return NOTIFY_DONE; @@ -699,7 +731,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } + get_online_cpus(); register_cpu_notifier(&oprofile_cpu_nb); + nmi_enabled = 0; + ctr_running = 0; + put_online_cpus(); /* default values, can be overwritten by model */ ops->create_files = nmi_create_files; @@ -729,7 +765,11 @@ void op_nmi_exit(void) { if (using_nmi) { exit_sysfs(); + get_online_cpus(); unregister_cpu_notifier(&oprofile_cpu_nb); + nmi_enabled = 0; + ctr_running = 0; + put_online_cpus(); } if (model->exit) model->exit(); From de654649737696ecf32873c341b305e30f3dc777 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 May 2010 14:41:22 +0200 Subject: [PATCH 21/24] oprofile/x86: stop disabled counters in nmi handler This patch adds checks to the nmi handler. Now samples are only generated and counters reenabled, if the counters are running. Otherwise the counters are stopped, if oprofile is using the nmi. In other cases it will ignore the nmi notification. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b56601eaf29d..94b5481bb6c6 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -62,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self, { struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; - int cpu = smp_processor_id(); switch (val) { case DIE_NMI: case DIE_NMI_IPI: - model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + if (ctr_running) + model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); + else if (!nmi_enabled) + break; + else + model->stop(&__get_cpu_var(cpu_msrs)); ret = NOTIFY_STOP; break; default: @@ -392,6 +396,9 @@ static int nmi_setup(void) mux_clone(cpu); } + nmi_enabled = 0; + ctr_running = 0; + barrier(); err = register_die_notifier(&profile_exceptions_nb); if (err) goto fail; @@ -451,6 +458,7 @@ static void nmi_shutdown(void) nmi_enabled = 0; ctr_running = 0; put_online_cpus(); + barrier(); unregister_die_notifier(&profile_exceptions_nb); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); From d30d64c6da3ec7a0708bfffa7e05752d5b9a1093 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 May 2010 15:52:26 +0200 Subject: [PATCH 22/24] oprofile/x86: reordering some functions Reordering some functions. Necessary for the next patch. No functional changes. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 134 ++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 94b5481bb6c6..7de0572b0a5e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -364,56 +364,6 @@ static struct notifier_block profile_exceptions_nb = { .priority = 2 }; -static int nmi_setup(void) -{ - int err = 0; - int cpu; - - if (!allocate_msrs()) - return -ENOMEM; - - /* We need to serialize save and setup for HT because the subset - * of msrs are distinct for save and setup operations - */ - - /* Assume saved/restored counters are the same on all CPUs */ - err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); - if (err) - goto fail; - - for_each_possible_cpu(cpu) { - if (!cpu) - continue; - - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - - mux_clone(cpu); - } - - nmi_enabled = 0; - ctr_running = 0; - barrier(); - err = register_die_notifier(&profile_exceptions_nb); - if (err) - goto fail; - - get_online_cpus(); - on_each_cpu(nmi_cpu_setup, NULL, 1); - nmi_enabled = 1; - put_online_cpus(); - - return 0; -fail: - free_msrs(); - return err; -} - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -449,23 +399,6 @@ static void nmi_cpu_shutdown(void *dummy) nmi_cpu_restore_registers(msrs); } -static void nmi_shutdown(void) -{ - struct op_msrs *msrs; - - get_online_cpus(); - on_each_cpu(nmi_cpu_shutdown, NULL, 1); - nmi_enabled = 0; - ctr_running = 0; - put_online_cpus(); - barrier(); - unregister_die_notifier(&profile_exceptions_nb); - msrs = &get_cpu_var(cpu_msrs); - model->shutdown(msrs); - free_msrs(); - put_cpu_var(cpu_msrs); -} - static void nmi_cpu_up(void *dummy) { if (nmi_enabled) @@ -531,6 +464,73 @@ static struct notifier_block oprofile_cpu_nb = { .notifier_call = oprofile_cpu_notifier }; +static int nmi_setup(void) +{ + int err = 0; + int cpu; + + if (!allocate_msrs()) + return -ENOMEM; + + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + + /* Assume saved/restored counters are the same on all CPUs */ + err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + if (err) + goto fail; + + for_each_possible_cpu(cpu) { + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); + } + + nmi_enabled = 0; + ctr_running = 0; + barrier(); + err = register_die_notifier(&profile_exceptions_nb); + if (err) + goto fail; + + get_online_cpus(); + on_each_cpu(nmi_cpu_setup, NULL, 1); + nmi_enabled = 1; + put_online_cpus(); + + return 0; +fail: + free_msrs(); + return err; +} + +static void nmi_shutdown(void) +{ + struct op_msrs *msrs; + + get_online_cpus(); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); + nmi_enabled = 0; + ctr_running = 0; + put_online_cpus(); + barrier(); + unregister_die_notifier(&profile_exceptions_nb); + msrs = &get_cpu_var(cpu_msrs); + model->shutdown(msrs); + free_msrs(); + put_cpu_var(cpu_msrs); +} + #ifdef CONFIG_PM static int nmi_suspend(struct sys_device *dev, pm_message_t state) From 3de668ee8d5b1e08da3200f926ff5a28aeb99bc2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 May 2010 15:00:25 +0200 Subject: [PATCH 23/24] oprofile/x86: notify cpus only when daemon is running This patch moves the cpu notifier registration from nmi_init() to nmi_setup(). The corresponding unregistration function is now in nmi_shutdown(). Thus, the hotplug code is only active, if the oprofile daemon is running. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7de0572b0a5e..2a086726cad1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -504,6 +504,7 @@ static int nmi_setup(void) goto fail; get_online_cpus(); + register_cpu_notifier(&oprofile_cpu_nb); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; put_online_cpus(); @@ -519,6 +520,7 @@ static void nmi_shutdown(void) struct op_msrs *msrs; get_online_cpus(); + unregister_cpu_notifier(&oprofile_cpu_nb); on_each_cpu(nmi_cpu_shutdown, NULL, 1); nmi_enabled = 0; ctr_running = 0; @@ -739,12 +741,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } - get_online_cpus(); - register_cpu_notifier(&oprofile_cpu_nb); - nmi_enabled = 0; - ctr_running = 0; - put_online_cpus(); - /* default values, can be overwritten by model */ ops->create_files = nmi_create_files; ops->setup = nmi_setup; @@ -771,14 +767,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) void op_nmi_exit(void) { - if (using_nmi) { + if (using_nmi) exit_sysfs(); - get_online_cpus(); - unregister_cpu_notifier(&oprofile_cpu_nb); - nmi_enabled = 0; - ctr_running = 0; - put_online_cpus(); - } if (model->exit) model->exit(); } From bae663bc635e2726c7c5228dbf0f2051e16d1c81 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 5 May 2010 17:47:17 +0200 Subject: [PATCH 24/24] oprofile/x86: make AMD IBS hotplug capable Current IBS code is not hotplug capable. An offline cpu might not be initialized or deinitialized properly. This patch fixes this by removing on_each_cpu() functions. The IBS init/deinit code is executed in the per-cpu functions model->setup_ctrs() and model->cpu_down() which are also called by hotplug notifiers. model->cpu_down() replaces model->exit() that became obsolete. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 +-- arch/x86/oprofile/op_model_amd.c | 54 ++++++++++---------------------- arch/x86/oprofile/op_x86_model.h | 2 +- 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 2a086726cad1..b28d2f1253bb 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -397,6 +397,8 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); + if (model->cpu_down) + model->cpu_down(); } static void nmi_cpu_up(void *dummy) @@ -769,6 +771,4 @@ void op_nmi_exit(void) { if (using_nmi) exit_sysfs(); - if (model->exit) - model->exit(); } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 384c52410480..b67a6b5aa8d4 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -374,6 +374,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, val |= op_x86_get_ctrl(model, &counter_config[virt]); wrmsrl(msrs->controls[i].addr, val); } + + if (ibs_caps) + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static void op_amd_cpu_shutdown(void) +{ + if (ibs_caps) + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -436,28 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs) op_amd_stop_ibs(); } -static u8 ibs_eilvt_off; - -static inline void apic_init_ibs_nmi_per_cpu(void *arg) -{ - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); -} - -static inline void apic_clear_ibs_nmi_per_cpu(void *arg) -{ - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); -} - -static int init_ibs_nmi(void) +static int __init_ibs_nmi(void) { #define IBSCTL_LVTOFFSETVAL (1 << 8) #define IBSCTL 0x1cc struct pci_dev *cpu_cfg; int nodes; u32 value = 0; + u8 ibs_eilvt_off; - /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); nodes = 0; cpu_cfg = NULL; @@ -487,21 +484,15 @@ static int init_ibs_nmi(void) return 0; } -/* uninitialize the APIC for the IBS interrupts if needed */ -static void clear_ibs_nmi(void) -{ - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); -} - /* initialize the APIC for the IBS interrupts if available */ -static void ibs_init(void) +static void init_ibs(void) { ibs_caps = get_ibs_caps(); if (!ibs_caps) return; - if (init_ibs_nmi()) { + if (__init_ibs_nmi()) { ibs_caps = 0; return; } @@ -510,14 +501,6 @@ static void ibs_init(void) (unsigned)ibs_caps); } -static void ibs_exit(void) -{ - if (!ibs_caps) - return; - - clear_ibs_nmi(); -} - static int (*create_arch_files)(struct super_block *sb, struct dentry *root); static int setup_ibs_files(struct super_block *sb, struct dentry *root) @@ -566,17 +549,12 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) static int op_amd_init(struct oprofile_operations *ops) { - ibs_init(); + init_ibs(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; return 0; } -static void op_amd_exit(void) -{ - ibs_exit(); -} - struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_COUNTERS, @@ -584,9 +562,9 @@ struct op_x86_model_spec op_amd_spec = { .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, - .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, + .cpu_down = &op_amd_cpu_shutdown, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 551401398fba..89017fa1fd63 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -40,10 +40,10 @@ struct op_x86_model_spec { u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); - void (*exit)(void); int (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_x86_model_spec const *model, struct op_msrs const * const msrs); + void (*cpu_down)(void); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs);