Merge branch 'tip/perf/ringbuffer-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into perf/urgent

This commit is contained in:
Ingo Molnar 2010-10-26 13:14:02 +02:00
commit 5c16d2c813
2 changed files with 172 additions and 175 deletions

View File

@ -62,18 +62,6 @@ enum ring_buffer_type {
unsigned ring_buffer_event_length(struct ring_buffer_event *event); unsigned ring_buffer_event_length(struct ring_buffer_event *event);
void *ring_buffer_event_data(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event);
/**
 * ring_buffer_event_time_delta - read the delta timestamp stored in an event
 * @event: the event whose delta timestamp is wanted
 *
 * Returns the event's time_delta field, i.e. the 27 bit timestamp
 * measured from the previous event.
 */
static inline unsigned
ring_buffer_event_time_delta(struct ring_buffer_event *event)
{
	unsigned delta = event->time_delta;

	return delta;
}
/* /*
* ring_buffer_discard_commit will remove an event that has not * ring_buffer_discard_commit will remove an event that has not
* been committed yet. If this is used, then ring_buffer_unlock_commit * been committed yet. If this is used, then ring_buffer_unlock_commit

View File

@ -224,6 +224,9 @@ enum {
RB_LEN_TIME_STAMP = 16, RB_LEN_TIME_STAMP = 16,
}; };
/*
 * Step RB_LEN_TIME_EXTEND bytes past @event and return the result as an
 * event pointer (the event that follows a time extend).
 * The argument is parenthesized so that a compound expression such as
 * "ev + 1" is evaluated as a whole before the cast to char *.
 */
#define skip_time_extend(event) \
	((struct ring_buffer_event *)((char *)(event) + RB_LEN_TIME_EXTEND))
static inline int rb_null_event(struct ring_buffer_event *event) static inline int rb_null_event(struct ring_buffer_event *event)
{ {
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
return length + RB_EVNT_HDR_SIZE; return length + RB_EVNT_HDR_SIZE;
} }
/* inline for ring buffer fast paths */ /*
static unsigned * Return the length of the given event. Will return
* the length of the time extend if the event is a
* time extend.
*/
static inline unsigned
rb_event_length(struct ring_buffer_event *event) rb_event_length(struct ring_buffer_event *event)
{ {
switch (event->type_len) { switch (event->type_len) {
@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
return 0; return 0;
} }
/*
 * Length of an event as it sits in the buffer together with its
 * time extend: for a TIME EXTEND this is the extend plus the event
 * that follows it, for every other event it is rb_event_length().
 */
static inline unsigned
rb_event_ts_length(struct ring_buffer_event *event)
{
	if (event->type_len != RINGBUF_TYPE_TIME_EXTEND)
		return rb_event_length(event);

	/* a time extend is counted together with the event after it */
	return RB_LEN_TIME_EXTEND + rb_event_length(skip_time_extend(event));
}
/** /**
* ring_buffer_event_length - return the length of the event * ring_buffer_event_length - return the length of the event
* @event: the event to get the length of * @event: the event to get the length of
*
* Returns the size of the data load of a data event.
* If the event is something other than a data event, it
* returns the size of the event itself. With the exception
* of a TIME EXTEND, where it still returns the size of the
* data load of the data event after it.
*/ */
unsigned ring_buffer_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{ {
unsigned length = rb_event_length(event); unsigned length;
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length; return length;
length -= RB_EVNT_HDR_SIZE; length -= RB_EVNT_HDR_SIZE;
@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void * static void *
rb_event_data(struct ring_buffer_event *event) rb_event_data(struct ring_buffer_event *event)
{ {
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */ /* If length is in len field, then array[0] has the data */
if (event->type_len) if (event->type_len)
@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
/* Max number of timestamps that can fit on a page */
#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
int ring_buffer_print_page_header(struct trace_seq *s) int ring_buffer_print_page_header(struct trace_seq *s)
{ {
struct buffer_data_page field; struct buffer_data_page field;
@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0; iter->head = 0;
} }
/*
 * Write a TIME EXTEND event carrying @delta at @event and return the
 * location just past it. Slow path, keep it out of line.
 */
static noinline struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
{
	event->type_len = RINGBUF_TYPE_TIME_EXTEND;

	/* The first event on a page stores a zero delta */
	if (!rb_event_index(event))
		delta = 0;

	/* Low bits go in time_delta, the remainder in array[0] */
	event->time_delta = delta & TS_MASK;
	event->array[0] = delta >> TS_SHIFT;

	return skip_time_extend(event);
}
/** /**
* rb_update_event - update event type and data * rb_update_event - update event type and data
* @event: the event to update * @event: the event to update
@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* data field. * data field.
*/ */
static void static void
rb_update_event(struct ring_buffer_event *event, rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
unsigned type, unsigned length) struct ring_buffer_event *event, unsigned length,
int add_timestamp, u64 delta)
{ {
event->type_len = type; /* Only a commit updates the timestamp */
if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
delta = 0;
switch (type) { /*
* If we need to add a timestamp, then we
case RINGBUF_TYPE_PADDING: * add it to the start of the resevered space.
case RINGBUF_TYPE_TIME_EXTEND: */
case RINGBUF_TYPE_TIME_STAMP: if (unlikely(add_timestamp)) {
break; event = rb_add_time_stamp(event, delta);
length -= RB_LEN_TIME_EXTEND;
case 0: delta = 0;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
event->array[0] = length;
else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
break;
default:
BUG();
} }
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
} }
/* /*
@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write); local_sub(length, &tail_page->write);
} }
static struct ring_buffer_event * /*
* This is the slow path, force gcc not to inline it.
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length, unsigned long tail, unsigned long length, unsigned long tail,
struct buffer_page *tail_page, u64 *ts) struct buffer_page *tail_page, u64 ts)
{ {
struct buffer_page *commit_page = cpu_buffer->commit_page; struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer *buffer = cpu_buffer->buffer;
@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Nested commits always have zero deltas, so * Nested commits always have zero deltas, so
* just reread the time stamp * just reread the time stamp
*/ */
*ts = rb_time_stamp(buffer); ts = rb_time_stamp(buffer);
next_page->page->time_stamp = *ts; next_page->page->time_stamp = ts;
} }
out_again: out_again:
@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_event * static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
unsigned type, unsigned long length, u64 *ts) unsigned long length, u64 ts,
u64 delta, int add_timestamp)
{ {
struct buffer_page *tail_page; struct buffer_page *tail_page;
struct ring_buffer_event *event; struct ring_buffer_event *event;
unsigned long tail, write; unsigned long tail, write;
/*
* If the time delta since the last event is too big to
* hold in the time field of the event, then we append a
* TIME EXTEND event ahead of the data event.
*/
if (unlikely(add_timestamp))
length += RB_LEN_TIME_EXTEND;
tail_page = cpu_buffer->tail_page; tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write); write = local_add_return(length, &tail_page->write);
@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - length; tail = write - length;
/* See if we shot past the end of this buffer page */ /* See if we shot past the end of this buffer page */
if (write > BUF_PAGE_SIZE) if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail, return rb_move_tail(cpu_buffer, length, tail,
tail_page, ts); tail_page, ts);
@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail); event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield); kmemcheck_annotate_bitfield(event, bitfield);
rb_update_event(event, type, length); rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
/* The passed in type is zero for DATA */ local_inc(&tail_page->entries);
if (likely(!type))
local_inc(&tail_page->entries);
/* /*
* If this is the first commit on the page, then update * If this is the first commit on the page, then update
* its timestamp. * its timestamp.
*/ */
if (!tail) if (!tail)
tail_page->page->time_stamp = *ts; tail_page->page->time_stamp = ts;
return event; return event;
} }
@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long addr; unsigned long addr;
new_index = rb_event_index(event); new_index = rb_event_index(event);
old_index = new_index + rb_event_length(event); old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event; addr = (unsigned long)event;
addr &= PAGE_MASK; addr &= PAGE_MASK;
@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0; return 0;
} }
/*
* Reserve a RINGBUF_TYPE_TIME_EXTEND event of RB_LEN_TIME_EXTEND bytes
* and, when that event is the commit, record *delta in it.
*
* Returns -EBUSY when the reservation yields NULL, -EAGAIN when it
* yields the -EAGAIN error pointer, 1 when the reserved event is the
* commit (cpu_buffer->write_stamp is then set to *ts) and 0 otherwise.
* On the 1 and 0 paths *delta is cleared before returning.
*/
static int
rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
u64 *ts, u64 *delta)
{
struct ring_buffer_event *event;
int ret;
/* Diagnostic only: warn once if the delta exceeds 2^59 */
WARN_ONCE(*delta > (1ULL << 59),
KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
(unsigned long long)*delta,
(unsigned long long)*ts,
(unsigned long long)cpu_buffer->write_stamp);
/*
* The delta is too big, we need to add a
* new timestamp.
*/
event = __rb_reserve_next(cpu_buffer,
RINGBUF_TYPE_TIME_EXTEND,
RB_LEN_TIME_EXTEND,
ts);
if (!event)
return -EBUSY;
if (PTR_ERR(event) == -EAGAIN)
return -EAGAIN;
/* Only a committed time event can update the write stamp */
if (rb_event_is_commit(cpu_buffer, event)) {
/*
* If this is the first on the page, then it was
* updated with the page itself. Try to discard it
* and if we can't just make it zero.
*/
if (rb_event_index(event)) {
/* split the delta: low bits in time_delta, the rest in array[0] */
event->time_delta = *delta & TS_MASK;
event->array[0] = *delta >> TS_SHIFT;
} else {
/* try to discard, since we do not need this */
if (!rb_try_to_discard(cpu_buffer, event)) {
/* nope, just zero it */
event->time_delta = 0;
event->array[0] = 0;
}
}
cpu_buffer->write_stamp = *ts;
/* let the caller know this was the commit */
ret = 1;
} else {
/* Try to discard the event */
if (!rb_try_to_discard(cpu_buffer, event)) {
/* Darn, this is just wasted space */
event->time_delta = 0;
event->array[0] = 0;
}
ret = 0;
}
/* the caller's delta is always cleared once we get this far */
*delta = 0;
return ret;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{ {
local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits); local_inc(&cpu_buffer->commits);
} }
static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{ {
unsigned long commits; unsigned long commits;
@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
unsigned long length) unsigned long length)
{ {
struct ring_buffer_event *event; struct ring_buffer_event *event;
u64 ts, delta = 0; u64 ts, delta;
int commit = 0;
int nr_loops = 0; int nr_loops = 0;
int add_timestamp;
u64 diff;
rb_start_commit(cpu_buffer); rb_start_commit(cpu_buffer);
@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
length = rb_calculate_event_length(length); length = rb_calculate_event_length(length);
again: again:
add_timestamp = 0;
delta = 0;
/* /*
* We allow for interrupts to reenter here and do a trace. * We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop * If one does, it will cause this original code to loop
@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
goto out_fail; goto out_fail;
ts = rb_time_stamp(cpu_buffer->buffer); ts = rb_time_stamp(cpu_buffer->buffer);
diff = ts - cpu_buffer->write_stamp;
/* /* make sure this diff is calculated here */
* Only the first commit can update the timestamp. barrier();
* Yes there is a race here. If an interrupt comes in
* just after the conditional and it traces too, then it
* will also check the deltas. More than one timestamp may
* also be made. But only the entry that did the actual
* commit will be something other than zero.
*/
if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
rb_page_write(cpu_buffer->tail_page) ==
rb_commit_index(cpu_buffer))) {
u64 diff;
diff = ts - cpu_buffer->write_stamp;
/* make sure this diff is calculated here */
barrier();
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
goto get_event;
/* Did the write stamp get updated already? */
if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff; delta = diff;
if (unlikely(test_time_stamp(delta))) { if (unlikely(test_time_stamp(delta))) {
WARN_ONCE(delta > (1ULL << 59),
commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
if (commit == -EBUSY) (unsigned long long)delta,
goto out_fail; (unsigned long long)ts,
(unsigned long long)cpu_buffer->write_stamp);
if (commit == -EAGAIN) add_timestamp = 1;
goto again;
RB_WARN_ON(cpu_buffer, commit < 0);
} }
} }
get_event: event = __rb_reserve_next(cpu_buffer, length, ts,
event = __rb_reserve_next(cpu_buffer, 0, length, &ts); delta, add_timestamp);
if (unlikely(PTR_ERR(event) == -EAGAIN)) if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again; goto again;
if (!event) if (!event)
goto out_fail; goto out_fail;
if (!rb_event_is_commit(cpu_buffer, event))
delta = 0;
event->time_delta = delta;
return event; return event;
out_fail: out_fail:
@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#define TRACE_RECURSIVE_DEPTH 16 #define TRACE_RECURSIVE_DEPTH 16
static int trace_recursive_lock(void) /* Keep this code out of the fast path cache */
static noinline void trace_recursive_fail(void)
{ {
current->trace_recursion++;
if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
return 0;
/* Disable all tracing before we do anything else */ /* Disable all tracing before we do anything else */
tracing_off_permanent(); tracing_off_permanent();
@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
in_nmi()); in_nmi());
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
}
static inline int trace_recursive_lock(void)
{
current->trace_recursion++;
if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
return 0;
trace_recursive_fail();
return -1; return -1;
} }
static void trace_recursive_unlock(void) static inline void trace_recursive_unlock(void)
{ {
WARN_ON_ONCE(!current->trace_recursion); WARN_ON_ONCE(!current->trace_recursion);
@ -2308,12 +2298,28 @@ static void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event) struct ring_buffer_event *event)
{ {
u64 delta;
/* /*
* The event first in the commit queue updates the * The event first in the commit queue updates the
* time stamp. * time stamp.
*/ */
if (rb_event_is_commit(cpu_buffer, event)) if (rb_event_is_commit(cpu_buffer, event)) {
cpu_buffer->write_stamp += event->time_delta; /*
* A commit event that is first on a page
* updates the write timestamp with the page stamp
*/
if (!rb_event_index(event))
cpu_buffer->write_stamp =
cpu_buffer->commit_page->page->time_stamp;
else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
delta = event->array[0];
delta <<= TS_SHIFT;
delta += event->time_delta;
cpu_buffer->write_stamp += delta;
} else
cpu_buffer->write_stamp += event->time_delta;
}
} }
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static inline void rb_event_discard(struct ring_buffer_event *event) static inline void rb_event_discard(struct ring_buffer_event *event)
{ {
if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */ /* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING; event->type_len = RINGBUF_TYPE_PADDING;
@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
again: again:
/* /*
* We repeat when a timestamp is encountered. It is possible * We repeat when a time extend is encountered.
* to get multiple timestamps from an interrupt entering just * Since the time extend is always attached to a data event,
* as one timestamp is about to be written, or from discarded * we should never loop more than once.
* commits. The most that we can have is the number on a single page. * (We never hit the following condition more than twice).
*/ */
if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL; return NULL;
reader = rb_get_reader_page(cpu_buffer); reader = rb_get_reader_page(cpu_buffer);
@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL; return NULL;
/* /*
* We repeat when a timestamp is encountered. * We repeat when a time extend is encountered.
* We can get multiple timestamps by nested interrupts or also * Since the time extend is always attached to a data event,
* if filtering is on (discarding commits). Since discarding * we should never loop more than once.
* commits can be frequent we can get a lot of timestamps. * (We never hit the following condition more than twice).
* But we limit them by not adding timestamps if they begin
* at the start of a page.
*/ */
if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL; return NULL;
if (rb_per_cpu_empty(cpu_buffer)) if (rb_per_cpu_empty(cpu_buffer))
@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (len > (commit - read)) if (len > (commit - read))
len = (commit - read); len = (commit - read);
size = rb_event_length(event); /* Always keep the time extend and data together */
size = rb_event_ts_length(event);
if (len < size) if (len < size)
goto out_unlock; goto out_unlock;
@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
break; break;
event = rb_reader_event(cpu_buffer); event = rb_reader_event(cpu_buffer);
size = rb_event_length(event); /* Always keep the time extend and data together */
size = rb_event_ts_length(event);
} while (len > size); } while (len > size);
/* update bpage */ /* update bpage */