perf/core improvements and fixes:

New features:
 
 . Add support for pagefault tracing in 'trace', please see multiple examples
   in the changeset messages (Stanislav Fomichev).
 
 User visible:
 
 . Fallback to syscalls:* when raw_syscalls:* is not available in the perl and
   python perf scripts. (Daniel Bristot de Oliveira)
 
 Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABAgAGBQJTrIl8AAoJENZQFvNTUqpA6qYP/3vOXV303st9uQzIFMXmVGAp
 1ocQQaeIJ3t1z0pkgzeTuRqWKVlcEdcn6JixY+M4EYY3TpVfScvKd6qszAFzi/7j
 dJfszZi54h7AtXhknz7lGF8vP98DC0sCwJt/zCPQYN3Muc8HdrhuH2F9dGvxyvul
 W0OeIZhww5CsB1O+WuDeRcYpnj8vc+xMgTq5u8e/ywp7aKLRsL4bk4vPWB2YWWV7
 c7ml/CU6bglq4DhetBRUBFMvKRUix5td0jPLzKVU8QsR2ScnqNVIHd+rHG3wc/qS
 +bfxzWX4wXFrnrwo5536yvnfQMCNcHjvbJMp49tO/ubYWewvkgj7UI0wAoaCfDf9
 7eOFIY+9PavFxSyuBRA/RbnZRjBVBmnYEm/3810ZQagNsJUe7eyhL4j7a3oConk+
 gcHewKacepM1w7R7e1wH26V2OzU/7YAuyLKcWvmzLKdvC6SsRGPayA4gpSwV1HiU
 WkwE9SxM3EDHQYdfHZH4fytUW6mJKA73aVdqnsZdgDrTrgiBK2t+ZwUGJYPicTHj
 nGUvNCLHMy9Je+Ci7tJ1yq4RJ5CMDSVDZkXX5ViZl6fnMTriruhr07U6HgndfuvG
 iPV5xYeqvPupJAtGyNRk7gXAyw96cryp+dluC6Vv+wbp/UB2VqDbpLR4w4fp3C22
 7KDaC+efxtJnKu1hbcnN
 =2zsL
 -----END PGP SIGNATURE-----

Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

New features:

  * Add support for pagefault tracing in 'trace', please see multiple examples
    in the changeset messages (Stanislav Fomichev).

User visible changes:

  * Fallback to syscalls:* when raw_syscalls:* is not available in the perl and
    python perf scripts. (Daniel Bristot de Oliveira)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2014-07-05 11:28:13 +02:00
commit 432100ba2a
12 changed files with 279 additions and 42 deletions

View File

@ -107,6 +107,52 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
Show tool stats such as number of times fd->pathname was discovered thru
hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
-F=[all|min|maj]::
--pf=[all|min|maj]::
Trace pagefaults. Optionally, you can specify whether you want minor,
major or all pagefaults. Default value is maj.
--syscalls::
Trace system calls. This options is enabled by default.
PAGEFAULTS
----------
When tracing pagefaults, the format of the trace is as follows:
<min|maj>fault [<ip.symbol>+<ip.offset>] => <addr.dso@addr.offset> (<map type><addr level>).
- min/maj indicates whether fault event is minor or major;
- ip.symbol shows symbol for instruction pointer (the code that generated the
fault); if no debug symbols available, perf trace will print raw IP;
- addr.dso shows DSO for the faulted address;
- map type is either 'd' for non-executable maps or 'x' for executable maps;
- addr level is either 'k' for kernel dso or '.' for user dso.
For symbols resolution you may need to install debugging symbols.
Please be aware that duration is currently always 0 and doesn't reflect actual
time it took for fault to be handled!
When --verbose specified, perf trace tries to print all available information
for both IP and fault address in the form of dso@symbol+offset.
EXAMPLES
--------
Trace only major pagefaults:
$ perf trace --no-syscalls -F
Trace syscalls, major and minor pagefaults:
$ perf trace -F all
1416.547 ( 0.000 ms): python/20235 majfault [CRYPTO_push_info_+0x0] => /lib/x86_64-linux-gnu/libcrypto.so.1.0.0@0x61be0 (x.)
As you can see, there was major pagefault in python process, from
CRYPTO_push_info_ routine which faulted somewhere in libcrypto.so.
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script[1]

View File

@ -1178,6 +1178,9 @@ fail:
return NULL;
}
#define TRACE_PFMAJ (1 << 0)
#define TRACE_PFMIN (1 << 1)
struct trace {
struct perf_tool tool;
struct {
@ -1212,6 +1215,8 @@ struct trace {
bool summary_only;
bool show_comm;
bool show_tool_stats;
bool trace_syscalls;
int trace_pgfaults;
};
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
@ -1538,6 +1543,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
}
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event,
struct perf_sample *sample);
static struct syscall *trace__syscall_info(struct trace *trace,
@ -1610,6 +1616,7 @@ static void thread__update_stats(struct thread_trace *ttrace,
}
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
char *msg;
@ -1658,6 +1665,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
}
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
int ret;
@ -1735,6 +1743,7 @@ out:
}
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
@ -1742,6 +1751,7 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
}
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
union perf_event *event __maybe_unused,
struct perf_sample *sample)
{
u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
@ -1768,6 +1778,68 @@ out_dump:
return 0;
}
static void print_location(FILE *f, struct perf_sample *sample,
struct addr_location *al,
bool print_dso, bool print_sym)
{
if ((verbose || print_dso) && al->map)
fprintf(f, "%s@", al->map->dso->long_name);
if ((verbose || print_sym) && al->sym)
fprintf(f, "%s+0x%lx", al->sym->name,
al->addr - al->sym->start);
else if (al->map)
fprintf(f, "0x%lx", al->addr);
else
fprintf(f, "0x%lx", sample->addr);
}
static int trace__pgfault(struct trace *trace,
struct perf_evsel *evsel,
union perf_event *event,
struct perf_sample *sample)
{
struct thread *thread;
u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
struct addr_location al;
char map_type = 'd';
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
thread__find_addr_location(thread, trace->host, cpumode, MAP__FUNCTION,
sample->ip, &al);
trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
fprintf(trace->output, "%sfault [",
evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
"maj" : "min");
print_location(trace->output, sample, &al, false, true);
fprintf(trace->output, "] => ");
thread__find_addr_location(thread, trace->host, cpumode, MAP__VARIABLE,
sample->addr, &al);
if (!al.map) {
thread__find_addr_location(thread, trace->host, cpumode,
MAP__FUNCTION, sample->addr, &al);
if (al.map)
map_type = 'x';
else
map_type = '?';
}
print_location(trace->output, sample, &al, true, false);
fprintf(trace->output, " (%c%c)\n", map_type, al.level);
return 0;
}
static bool skip_sample(struct trace *trace, struct perf_sample *sample)
{
if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
@ -1781,7 +1853,7 @@ static bool skip_sample(struct trace *trace, struct perf_sample *sample)
}
static int trace__process_sample(struct perf_tool *tool,
union perf_event *event __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
struct perf_evsel *evsel,
struct machine *machine __maybe_unused)
@ -1799,7 +1871,7 @@ static int trace__process_sample(struct perf_tool *tool,
if (handler) {
++trace->nr_events;
handler(trace, evsel, sample);
handler(trace, evsel, event, sample);
}
return err;
@ -1826,7 +1898,7 @@ static int parse_target_str(struct trace *trace)
return 0;
}
static int trace__record(int argc, const char **argv)
static int trace__record(struct trace *trace, int argc, const char **argv)
{
unsigned int rec_argc, i, j;
const char **rec_argv;
@ -1835,34 +1907,54 @@ static int trace__record(int argc, const char **argv)
"-R",
"-m", "1024",
"-c", "1",
"-e",
};
const char * const sc_args[] = { "-e", };
unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
const char * const majpf_args[] = { "-e", "major-faults" };
unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
const char * const minpf_args[] = { "-e", "minor-faults" };
unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
/* +1 is for the event string below */
rec_argc = ARRAY_SIZE(record_args) + 1 + argc;
rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
majpf_args_nr + minpf_args_nr + argc;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
if (rec_argv == NULL)
return -ENOMEM;
j = 0;
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = record_args[i];
rec_argv[j++] = record_args[i];
/* event string may be different for older kernels - e.g., RHEL6 */
if (is_valid_tracepoint("raw_syscalls:sys_enter"))
rec_argv[i] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
else if (is_valid_tracepoint("syscalls:sys_enter"))
rec_argv[i] = "syscalls:sys_enter,syscalls:sys_exit";
else {
pr_err("Neither raw_syscalls nor syscalls events exist.\n");
return -1;
if (trace->trace_syscalls) {
for (i = 0; i < sc_args_nr; i++)
rec_argv[j++] = sc_args[i];
/* event string may be different for older kernels - e.g., RHEL6 */
if (is_valid_tracepoint("raw_syscalls:sys_enter"))
rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
else if (is_valid_tracepoint("syscalls:sys_enter"))
rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
else {
pr_err("Neither raw_syscalls nor syscalls events exist.\n");
return -1;
}
}
i++;
for (j = 0; j < (unsigned int)argc; j++, i++)
rec_argv[i] = argv[j];
if (trace->trace_pgfaults & TRACE_PFMAJ)
for (i = 0; i < majpf_args_nr; i++)
rec_argv[j++] = majpf_args[i];
return cmd_record(i, rec_argv, NULL);
if (trace->trace_pgfaults & TRACE_PFMIN)
for (i = 0; i < minpf_args_nr; i++)
rec_argv[j++] = minpf_args[i];
for (i = 0; i < (unsigned int)argc; i++)
rec_argv[j++] = argv[i];
return cmd_record(j, rec_argv, NULL);
}
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
@ -1882,6 +1974,30 @@ static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
perf_evlist__add(evlist, evsel);
}
static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
u64 config)
{
struct perf_evsel *evsel;
struct perf_event_attr attr = {
.type = PERF_TYPE_SOFTWARE,
.mmap_data = 1,
.sample_period = 1,
};
attr.config = config;
event_attr_init(&attr);
evsel = perf_evsel__new(&attr);
if (!evsel)
return -ENOMEM;
evsel->handler = trace__pgfault;
perf_evlist__add(evlist, evsel);
return 0;
}
static int trace__run(struct trace *trace, int argc, const char **argv)
{
struct perf_evlist *evlist = perf_evlist__new();
@ -1897,10 +2013,21 @@ static int trace__run(struct trace *trace, int argc, const char **argv)
goto out;
}
if (perf_evlist__add_syscall_newtp(evlist, trace__sys_enter, trace__sys_exit))
if (trace->trace_syscalls &&
perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
trace__sys_exit))
goto out_error_tp;
perf_evlist__add_vfs_getname(evlist);
if (trace->trace_syscalls)
perf_evlist__add_vfs_getname(evlist);
if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ))
goto out_error_tp;
if ((trace->trace_pgfaults & TRACE_PFMIN) &&
perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
goto out_error_tp;
if (trace->sched &&
perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
@ -1982,7 +2109,8 @@ again:
goto next_event;
}
if (sample.raw_data == NULL) {
if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
sample.raw_data == NULL) {
fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
perf_evsel__name(evsel), sample.tid,
sample.cpu, sample.raw_size);
@ -1990,7 +2118,7 @@ again:
}
handler = evsel->handler;
handler(trace, evsel, &sample);
handler(trace, evsel, event, &sample);
next_event:
perf_evlist__mmap_consume(evlist, i);
@ -2093,13 +2221,10 @@ static int trace__replay(struct trace *trace)
if (evsel == NULL)
evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
"syscalls:sys_enter");
if (evsel == NULL) {
pr_err("Data file does not have raw_syscalls:sys_enter event\n");
goto out;
}
if (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
perf_evsel__init_sc_tp_ptr_field(evsel, args)) {
if (evsel &&
(perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
pr_err("Error during initialize raw_syscalls:sys_enter event\n");
goto out;
}
@ -2109,15 +2234,19 @@ static int trace__replay(struct trace *trace)
if (evsel == NULL)
evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
"syscalls:sys_exit");
if (evsel == NULL) {
pr_err("Data file does not have raw_syscalls:sys_exit event\n");
if (evsel &&
(perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
pr_err("Error during initialize raw_syscalls:sys_exit event\n");
goto out;
}
if (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
perf_evsel__init_sc_tp_uint_field(evsel, ret)) {
pr_err("Error during initialize raw_syscalls:sys_exit event\n");
goto out;
evlist__for_each(session->evlist, evsel) {
if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
(evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
evsel->handler = trace__pgfault;
}
err = parse_target_str(trace);
@ -2264,6 +2393,23 @@ static int trace__open_output(struct trace *trace, const char *filename)
return trace->output == NULL ? -errno : 0;
}
static int parse_pagefaults(const struct option *opt, const char *str,
int unset __maybe_unused)
{
int *trace_pgfaults = opt->value;
if (strcmp(str, "all") == 0)
*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
else if (strcmp(str, "maj") == 0)
*trace_pgfaults |= TRACE_PFMAJ;
else if (strcmp(str, "min") == 0)
*trace_pgfaults |= TRACE_PFMIN;
else
return -1;
return 0;
}
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
const char * const trace_usage[] = {
@ -2293,6 +2439,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
},
.output = stdout,
.show_comm = true,
.trace_syscalls = true,
};
const char *output_name = NULL;
const char *ev_qualifier_str = NULL;
@ -2330,20 +2477,34 @@ int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
"Show only syscall summary with statistics"),
OPT_BOOLEAN('S', "with-summary", &trace.summary,
"Show all syscalls and summary with statistics"),
OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
"Trace pagefaults", parse_pagefaults, "maj"),
OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
OPT_END()
};
int err;
char bf[BUFSIZ];
if ((argc > 1) && (strcmp(argv[1], "record") == 0))
return trace__record(argc-2, &argv[2]);
argc = parse_options(argc, argv, trace_options, trace_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
argc = parse_options(argc, argv, trace_options, trace_usage, 0);
if (trace.trace_pgfaults) {
trace.opts.sample_address = true;
trace.opts.sample_time = true;
}
if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
return trace__record(&trace, argc-1, &argv[1]);
/* summary_only implies summary option, but don't overwrite summary if set */
if (trace.summary_only)
trace.summary = trace.summary_only;
if (!trace.trace_syscalls && !trace.trace_pgfaults) {
pr_err("Please specify something to trace.\n");
return -1;
}
if (output_name != NULL) {
err = trace__open_output(&trace, output_name);
if (err < 0) {

View File

@ -1,2 +1,3 @@
#!/bin/bash
perf record -e raw_syscalls:sys_exit $@
(perf record -e raw_syscalls:sys_exit $@ || \
perf record -e syscalls:sys_exit $@) 2> /dev/null

View File

@ -26,6 +26,11 @@ sub raw_syscalls::sys_exit
}
}
sub syscalls::sys_exit
{
raw_syscalls::sys_exit(@_)
}
sub trace_end
{
printf("\nfailed syscalls by comm:\n\n");

View File

@ -1,2 +1,3 @@
#!/bin/bash
perf record -e raw_syscalls:sys_exit $@
(perf record -e raw_syscalls:sys_exit $@ || \
perf record -e syscalls:sys_exit $@) 2> /dev/null

View File

@ -1,2 +1,3 @@
#!/bin/bash
perf record -e raw_syscalls:sys_enter $@
(perf record -e raw_syscalls:sys_enter $@ || \
perf record -e syscalls:sys_enter $@) 2> /dev/null

View File

@ -1,2 +1,3 @@
#!/bin/bash
perf record -e raw_syscalls:sys_enter $@
(perf record -e raw_syscalls:sys_enter $@ || \
perf record -e syscalls:sys_enter $@) 2> /dev/null

View File

@ -1,2 +1,3 @@
#!/bin/bash
perf record -e raw_syscalls:sys_enter $@
(perf record -e raw_syscalls:sys_enter $@ || \
perf record -e syscalls:sys_enter $@) 2> /dev/null

View File

@ -50,6 +50,11 @@ def raw_syscalls__sys_exit(event_name, context, common_cpu,
except TypeError:
syscalls[common_comm][common_pid][id][ret] = 1
def syscalls__sys_exit(event_name, context, common_cpu,
common_secs, common_nsecs, common_pid, common_comm,
id, ret):
raw_syscalls__sys_exit(**locals())
def print_error_totals():
if for_comm is not None:
print "\nsyscall errors for %s:\n\n" % (for_comm),

View File

@ -53,6 +53,11 @@ def raw_syscalls__sys_enter(event_name, context, common_cpu,
except TypeError:
syscalls[id] = 1
def syscalls__sys_enter(event_name, context, common_cpu,
common_secs, common_nsecs, common_pid, common_comm,
id, args):
raw_syscalls__sys_enter(**locals())
def print_syscall_totals(interval):
while 1:
clear_term()

View File

@ -48,6 +48,11 @@ def raw_syscalls__sys_enter(event_name, context, common_cpu,
except TypeError:
syscalls[common_comm][common_pid][id] = 1
def syscalls__sys_enter(event_name, context, common_cpu,
common_secs, common_nsecs, common_pid, common_comm,
id, args):
raw_syscalls__sys_enter(**locals())
def print_syscall_totals():
if for_comm is not None:
print "\nsyscall events for %s:\n\n" % (for_comm),

View File

@ -44,6 +44,11 @@ def raw_syscalls__sys_enter(event_name, context, common_cpu,
except TypeError:
syscalls[id] = 1
def syscalls__sys_enter(event_name, context, common_cpu,
common_secs, common_nsecs, common_pid, common_comm,
id, args):
raw_syscalls__sys_enter(**locals())
def print_syscall_totals():
if for_comm is not None:
print "\nsyscall events for %s:\n\n" % (for_comm),