2019-01-16 19:10:59 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2011-05-20 01:55:04 +08:00
|
|
|
/*
|
|
|
|
* Performance events ring-buffer code:
|
|
|
|
*
|
|
|
|
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
|
|
|
|
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
|
2015-11-16 18:08:45 +08:00
|
|
|
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
|
2011-12-30 06:09:01 +08:00
|
|
|
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
|
2011-05-20 01:55:04 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/perf_event.h>
|
|
|
|
#include <linux/vmalloc.h>
|
|
|
|
#include <linux/slab.h>
|
2013-10-31 17:19:59 +08:00
|
|
|
#include <linux/circ_buf.h>
|
2015-01-29 01:54:38 +08:00
|
|
|
#include <linux/poll.h>
|
2018-04-20 20:03:18 +08:00
|
|
|
#include <linux/nospec.h>
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
#include "internal.h"
|
|
|
|
|
|
|
|
static void perf_output_wakeup(struct perf_output_handle *handle)
|
|
|
|
{
|
2018-02-12 06:34:03 +08:00
|
|
|
atomic_set(&handle->rb->poll, EPOLLIN);
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2011-06-27 20:41:57 +08:00
|
|
|
handle->event->pending_wakeup = 1;
|
|
|
|
irq_work_queue(&handle->event->pending);
|
2011-05-20 01:55:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to ensure a later event_id doesn't publish a head when a former
|
|
|
|
* event isn't done writing. However since we need to deal with NMIs we
|
|
|
|
* cannot fully serialize things.
|
|
|
|
*
|
|
|
|
* We only publish the head (and generate a wakeup) when the outer-most
|
|
|
|
* event completes.
|
|
|
|
*/
|
|
|
|
static void perf_output_get_handle(struct perf_output_handle *handle)
|
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb = handle->rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
preempt_disable();
|
2019-05-17 19:52:34 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Avoid an explicit LOAD/STORE such that architectures with memops
|
|
|
|
* can use them.
|
|
|
|
*/
|
|
|
|
(*(volatile unsigned int *)&rb->nest)++;
|
2011-05-20 01:55:04 +08:00
|
|
|
handle->wakeup = local_read(&rb->wakeup);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_output_put_handle(struct perf_output_handle *handle)
|
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb = handle->rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
unsigned long head;
|
2019-05-17 19:52:34 +08:00
|
|
|
unsigned int nest;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this isn't the outermost nesting, we don't have to update
|
|
|
|
* @rb->user_page->data_head.
|
|
|
|
*/
|
|
|
|
nest = READ_ONCE(rb->nest);
|
|
|
|
if (nest > 1) {
|
|
|
|
WRITE_ONCE(rb->nest, nest - 1);
|
|
|
|
goto out;
|
|
|
|
}
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
again:
|
2019-05-17 19:52:32 +08:00
|
|
|
/*
|
|
|
|
* In order to avoid publishing a head value that goes backwards,
|
|
|
|
* we must ensure the load of @rb->head happens after we've
|
|
|
|
* incremented @rb->nest.
|
|
|
|
*
|
|
|
|
* Otherwise we can observe a @rb->head value before one published
|
|
|
|
* by an IRQ/NMI happening between the load and the increment.
|
|
|
|
*/
|
|
|
|
barrier();
|
2011-05-20 01:55:04 +08:00
|
|
|
head = local_read(&rb->head);
|
|
|
|
|
|
|
|
/*
|
2019-05-17 19:52:31 +08:00
|
|
|
* IRQ/NMI can happen here and advance @rb->head, causing our
|
|
|
|
* load above to be stale.
|
2011-05-20 01:55:04 +08:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
2013-10-28 20:55:29 +08:00
|
|
|
* Since the mmap() consumer (userspace) can run on a different CPU:
|
|
|
|
*
|
|
|
|
* kernel user
|
|
|
|
*
|
2013-11-25 18:49:10 +08:00
|
|
|
* if (LOAD ->data_tail) { LOAD ->data_head
|
|
|
|
* (A) smp_rmb() (C)
|
|
|
|
* STORE $data LOAD $data
|
|
|
|
* smp_wmb() (B) smp_mb() (D)
|
|
|
|
* STORE ->data_head STORE ->data_tail
|
|
|
|
* }
|
2013-10-28 20:55:29 +08:00
|
|
|
*
|
|
|
|
* Where A pairs with D, and B pairs with C.
|
|
|
|
*
|
2013-11-25 18:49:10 +08:00
|
|
|
* In our case (A) is a control dependency that separates the load of
|
|
|
|
* the ->data_tail and the stores of $data. In case ->data_tail
|
|
|
|
* indicates there is no room in the buffer to store $data we do not.
|
2013-10-28 20:55:29 +08:00
|
|
|
*
|
2013-11-25 18:49:10 +08:00
|
|
|
* D needs to be a full barrier since it separates the data READ
|
2013-10-28 20:55:29 +08:00
|
|
|
* from the tail WRITE.
|
|
|
|
*
|
|
|
|
* For B a WMB is sufficient since it separates two WRITEs, and for C
|
|
|
|
* an RMB is sufficient since it separates two READs.
|
|
|
|
*
|
|
|
|
* See perf_output_begin().
|
2011-05-20 01:55:04 +08:00
|
|
|
*/
|
2013-11-25 18:49:10 +08:00
|
|
|
smp_wmb(); /* B, matches C */
|
2019-05-17 19:52:33 +08:00
|
|
|
WRITE_ONCE(rb->user_page->data_head, head);
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
/*
|
2019-05-17 19:52:31 +08:00
|
|
|
* We must publish the head before decrementing the nest count,
|
|
|
|
* otherwise an IRQ/NMI can publish a more recent head value and our
|
|
|
|
* write will (temporarily) publish a stale value.
|
|
|
|
*/
|
|
|
|
barrier();
|
2019-05-17 19:52:34 +08:00
|
|
|
WRITE_ONCE(rb->nest, 0);
|
2019-05-17 19:52:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Ensure we decrement @rb->nest before we validate the @rb->head.
|
|
|
|
* Otherwise we cannot be sure we caught the 'last' nested update.
|
2011-05-20 01:55:04 +08:00
|
|
|
*/
|
2019-05-17 19:52:31 +08:00
|
|
|
barrier();
|
2011-05-20 01:55:04 +08:00
|
|
|
if (unlikely(head != local_read(&rb->head))) {
|
2019-05-17 19:52:34 +08:00
|
|
|
WRITE_ONCE(rb->nest, 1);
|
2011-05-20 01:55:04 +08:00
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (handle->wakeup != local_read(&rb->wakeup))
|
|
|
|
perf_output_wakeup(handle);
|
|
|
|
|
|
|
|
out:
|
|
|
|
preempt_enable();
|
|
|
|
}
|
|
|
|
|
2018-03-09 04:28:56 +08:00
|
|
|
static __always_inline bool
|
2016-03-28 14:41:31 +08:00
|
|
|
ring_buffer_has_space(unsigned long head, unsigned long tail,
|
|
|
|
unsigned long data_size, unsigned int size,
|
|
|
|
bool backward)
|
|
|
|
{
|
|
|
|
if (!backward)
|
|
|
|
return CIRC_SPACE(head, tail, data_size) >= size;
|
|
|
|
else
|
|
|
|
return CIRC_SPACE(tail, head, data_size) >= size;
|
|
|
|
}
|
|
|
|
|
2018-03-09 04:28:56 +08:00
|
|
|
static __always_inline int
|
2016-03-28 14:41:31 +08:00
|
|
|
__perf_output_begin(struct perf_output_handle *handle,
|
2020-10-30 22:50:32 +08:00
|
|
|
struct perf_sample_data *data,
|
2016-03-28 14:41:31 +08:00
|
|
|
struct perf_event *event, unsigned int size,
|
|
|
|
bool backward)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
unsigned long tail, offset, head;
|
2013-11-01 00:36:25 +08:00
|
|
|
int have_lost, page_shift;
|
2011-05-20 01:55:04 +08:00
|
|
|
struct {
|
|
|
|
struct perf_event_header header;
|
|
|
|
u64 id;
|
|
|
|
u64 lost;
|
|
|
|
} lost_event;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
/*
|
|
|
|
* For inherited events we send all the output towards the parent.
|
|
|
|
*/
|
|
|
|
if (event->parent)
|
|
|
|
event = event->parent;
|
|
|
|
|
|
|
|
rb = rcu_dereference(event->rb);
|
2013-11-01 00:20:25 +08:00
|
|
|
if (unlikely(!rb))
|
2011-05-20 01:55:04 +08:00
|
|
|
goto out;
|
|
|
|
|
2016-03-28 14:41:29 +08:00
|
|
|
if (unlikely(rb->paused)) {
|
|
|
|
if (rb->nr_pages)
|
|
|
|
local_inc(&rb->lost);
|
2011-05-20 01:55:04 +08:00
|
|
|
goto out;
|
2016-03-28 14:41:29 +08:00
|
|
|
}
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2013-11-01 00:20:25 +08:00
|
|
|
handle->rb = rb;
|
|
|
|
handle->event = event;
|
|
|
|
|
2011-05-20 01:55:04 +08:00
|
|
|
have_lost = local_read(&rb->lost);
|
2013-11-01 00:20:25 +08:00
|
|
|
if (unlikely(have_lost)) {
|
2013-11-01 00:29:29 +08:00
|
|
|
size += sizeof(lost_event);
|
|
|
|
if (event->attr.sample_id_all)
|
|
|
|
size += event->id_header_size;
|
2011-05-20 01:55:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
perf_output_get_handle(handle);
|
|
|
|
|
|
|
|
do {
|
atomic: remove all traces of READ_ONCE_CTRL() and atomic*_read_ctrl()
This seems to be a mis-reading of how alpha memory ordering works, and
is not backed up by the alpha architecture manual. The helper functions
don't do anything special on any other architectures, and the arguments
that support them being safe on other architectures also argue that they
are safe on alpha.
Basically, the "control dependency" is between a previous read and a
subsequent write that is dependent on the value read. Even if the
subsequent write is actually done speculatively, there is no way that
such a speculative write could be made visible to other cpu's until it
has been committed, which requires validating the speculation.
Note that most weakely ordered architectures (very much including alpha)
do not guarantee any ordering relationship between two loads that depend
on each other on a control dependency:
read A
if (val == 1)
read B
because the conditional may be predicted, and the "read B" may be
speculatively moved up to before reading the value A. So we require the
user to insert a smp_rmb() between the two accesses to be correct:
read A;
if (A == 1)
smp_rmb()
read B
Alpha is further special in that it can break that ordering even if the
*address* of B depends on the read of A, because the cacheline that is
read later may be stale unless you have a memory barrier in between the
pointer read and the read of the value behind a pointer:
read ptr
read offset(ptr)
whereas all other weakly ordered architectures guarantee that the data
dependency (as opposed to just a control dependency) will order the two
accesses. As a result, alpha needs a "smp_read_barrier_depends()" in
between those two reads for them to be ordered.
The coontrol dependency that "READ_ONCE_CTRL()" and "atomic_read_ctrl()"
had was a control dependency to a subsequent *write*, however, and
nobody can finalize such a subsequent write without having actually done
the read. And were you to write such a value to a "stale" cacheline
(the way the unordered reads came to be), that would seem to lose the
write entirely.
So the things that make alpha able to re-order reads even more
aggressively than other weak architectures do not seem to be relevant
for a subsequent write. Alpha memory ordering may be strange, but
there's no real indication that it is *that* strange.
Also, the alpha architecture reference manual very explicitly talks
about the definition of "Dependence Constraints" in section 5.6.1.7,
where a preceding read dominates a subsequent write.
Such a dependence constraint admittedly does not impose a BEFORE (alpha
architecture term for globally visible ordering), but it does guarantee
that there can be no "causal loop". I don't see how you could avoid
such a loop if another cpu could see the stored value and then impact
the value of the first read. Put another way: the read and the write
could not be seen as being out of order wrt other cpus.
So I do not see how these "x_ctrl()" functions can currently be necessary.
I may have to eat my words at some point, but in the absense of clear
proof that alpha actually needs this, or indeed even an explanation of
how alpha could _possibly_ need it, I do not believe these functions are
called for.
And if it turns out that alpha really _does_ need a barrier for this
case, that barrier still should not be "smp_read_barrier_depends()".
We'd have to make up some new speciality barrier just for alpha, along
with the documentation for why it really is necessary.
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul E McKenney <paulmck@us.ibm.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-04 09:22:17 +08:00
|
|
|
tail = READ_ONCE(rb->user_page->data_tail);
|
2011-05-20 01:55:04 +08:00
|
|
|
offset = head = local_read(&rb->head);
|
2016-03-28 14:41:31 +08:00
|
|
|
if (!rb->overwrite) {
|
|
|
|
if (unlikely(!ring_buffer_has_space(head, tail,
|
|
|
|
perf_data_size(rb),
|
|
|
|
size, backward)))
|
|
|
|
goto fail;
|
|
|
|
}
|
2013-11-25 18:49:10 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The above forms a control dependency barrier separating the
|
|
|
|
* @tail load above from the data stores below. Since the @tail
|
|
|
|
* load is required to compute the branch to fail below.
|
|
|
|
*
|
|
|
|
* A, matches D; the full memory barrier userspace SHOULD issue
|
|
|
|
* after reading the data and before storing the new tail
|
|
|
|
* position.
|
|
|
|
*
|
|
|
|
* See perf_output_put_handle().
|
|
|
|
*/
|
|
|
|
|
2016-03-28 14:41:31 +08:00
|
|
|
if (!backward)
|
|
|
|
head += size;
|
|
|
|
else
|
|
|
|
head -= size;
|
2011-05-20 01:55:04 +08:00
|
|
|
} while (local_cmpxchg(&rb->head, offset, head) != offset);
|
|
|
|
|
2016-03-28 14:41:31 +08:00
|
|
|
if (backward) {
|
|
|
|
offset = head;
|
|
|
|
head = (u64)(-head);
|
|
|
|
}
|
|
|
|
|
2013-11-01 00:25:38 +08:00
|
|
|
/*
|
2013-11-25 18:49:10 +08:00
|
|
|
* We rely on the implied barrier() by local_cmpxchg() to ensure
|
|
|
|
* none of the data stores below can be lifted up by the compiler.
|
2013-11-01 00:25:38 +08:00
|
|
|
*/
|
|
|
|
|
2013-11-01 00:20:25 +08:00
|
|
|
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
|
2011-05-20 01:55:04 +08:00
|
|
|
local_add(rb->watermark, &rb->wakeup);
|
|
|
|
|
2013-11-01 00:36:25 +08:00
|
|
|
page_shift = PAGE_SHIFT + page_order(rb);
|
|
|
|
|
|
|
|
handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
|
|
|
|
offset &= (1UL << page_shift) - 1;
|
|
|
|
handle->addr = rb->data_pages[handle->page] + offset;
|
|
|
|
handle->size = (1UL << page_shift) - offset;
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2013-11-01 00:20:25 +08:00
|
|
|
if (unlikely(have_lost)) {
|
2013-11-01 00:29:29 +08:00
|
|
|
lost_event.header.size = sizeof(lost_event);
|
2011-05-20 01:55:04 +08:00
|
|
|
lost_event.header.type = PERF_RECORD_LOST;
|
|
|
|
lost_event.header.misc = 0;
|
|
|
|
lost_event.id = event->id;
|
|
|
|
lost_event.lost = local_xchg(&rb->lost, 0);
|
|
|
|
|
2020-10-30 22:50:32 +08:00
|
|
|
/* XXX mostly redundant; @data is already fully initializes */
|
|
|
|
perf_event_header__init_id(&lost_event.header, data, event);
|
2011-05-20 01:55:04 +08:00
|
|
|
perf_output_put(handle, lost_event);
|
2020-10-30 22:50:32 +08:00
|
|
|
perf_event__output_id_sample(event, handle, data);
|
2011-05-20 01:55:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fail:
|
|
|
|
local_inc(&rb->lost);
|
|
|
|
perf_output_put_handle(handle);
|
|
|
|
out:
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
int perf_output_begin_forward(struct perf_output_handle *handle,
|
2020-10-30 22:50:32 +08:00
|
|
|
struct perf_sample_data *data,
|
|
|
|
struct perf_event *event, unsigned int size)
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
{
|
2020-10-30 22:50:32 +08:00
|
|
|
return __perf_output_begin(handle, data, event, size, false);
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int perf_output_begin_backward(struct perf_output_handle *handle,
|
2020-10-30 22:50:32 +08:00
|
|
|
struct perf_sample_data *data,
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
struct perf_event *event, unsigned int size)
|
|
|
|
{
|
2020-10-30 22:50:32 +08:00
|
|
|
return __perf_output_begin(handle, data, event, size, true);
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
}
|
|
|
|
|
2016-03-28 14:41:31 +08:00
|
|
|
int perf_output_begin(struct perf_output_handle *handle,
|
2020-10-30 22:50:32 +08:00
|
|
|
struct perf_sample_data *data,
|
2016-03-28 14:41:31 +08:00
|
|
|
struct perf_event *event, unsigned int size)
|
|
|
|
{
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
|
2020-10-30 22:50:32 +08:00
|
|
|
return __perf_output_begin(handle, data, event, size,
|
perf/core: Add ::write_backward attribute to perf event
This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.
Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.
By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.
The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.
(X) head
. |
. V
+------+-------+----------+------+---+
|A....A|B.....B|C........C|D....D| |
+------+-------+----------+------+---+
Record 'A' is overwritten by event 'E':
head
|
V
+--+---+-------+----------+------+---+
|.E|..A|B.....B|C........C|D....D|E..|
+--+---+-------+----------+------+---+
Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.
The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.
Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.
'write_backward' is a better solution to this problem.
Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:
head
|
V
+---+------+----------+-------+------+
| |D....D|C........C|B.....B|A....A|
+---+------+----------+-------+------+
and after overwriting:
head
|
V
+---+------+----------+-------+---+--+
|..E|D....D|C........C|B.....B|A..|E.|
+---+------+----------+-------+---+--+
In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.
The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.
To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).
By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:
/* A union of all possible events */
union perf_event event;
p = head = perf_mmap__read_head();
while (true) {
/* copy header of next event */
fetch(&event.header, p, sizeof(event.header));
/* read 'head' pointer */
head = perf_mmap__read_head();
/* check overwritten: is the header good? */
if (!verify(sizeof(event.header), p, head))
break;
/* copy the whole event */
fetch(&event, p, event.header.size);
/* read 'head' pointer again */
head = perf_mmap__read_head();
/* is the whole event good? */
if (!verify(event.header.size, p, head))
break;
p += event.header.size;
}
However, the overhead is high because:
a) In-place decoding is not safe.
Copying-verifying-decoding is required.
b) Fetching 'head' pointer requires additional synchronization.
(From Alexei Starovoitov:
Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)
This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.
Performance testing:
Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration. Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.
Testing environment:
CPU : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
Kernel : v4.5.0
MEAN STDVAR
BASE 800214.950 2853.083
PRE1 2253846.700 9997.014
PRE2 2257495.540 8516.293
POST 2250896.100 8933.921
Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.
Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.
[1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
[2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
[3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
[4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 22:11:18 +08:00
|
|
|
unlikely(is_write_backward(event)));
|
2016-03-28 14:41:31 +08:00
|
|
|
}
|
|
|
|
|
2012-08-07 21:20:38 +08:00
|
|
|
unsigned int perf_output_copy(struct perf_output_handle *handle,
|
2011-05-20 01:55:04 +08:00
|
|
|
const void *buf, unsigned int len)
|
|
|
|
{
|
2012-08-07 21:20:38 +08:00
|
|
|
return __output_copy(handle, buf, len);
|
2011-05-20 01:55:04 +08:00
|
|
|
}
|
|
|
|
|
2012-08-07 21:20:39 +08:00
|
|
|
unsigned int perf_output_skip(struct perf_output_handle *handle,
|
|
|
|
unsigned int len)
|
|
|
|
{
|
|
|
|
return __output_skip(handle, NULL, len);
|
|
|
|
}
|
|
|
|
|
2011-05-20 01:55:04 +08:00
|
|
|
void perf_output_end(struct perf_output_handle *handle)
|
|
|
|
{
|
|
|
|
perf_output_put_handle(handle);
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2019-12-14 02:21:30 +08:00
|
|
|
ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
|
|
|
long max_size = perf_data_size(rb);
|
|
|
|
|
|
|
|
if (watermark)
|
|
|
|
rb->watermark = min(max_size, watermark);
|
|
|
|
|
|
|
|
if (!rb->watermark)
|
|
|
|
rb->watermark = max_size / 2;
|
|
|
|
|
|
|
|
if (flags & RING_BUFFER_WRITABLE)
|
2013-03-18 21:33:28 +08:00
|
|
|
rb->overwrite = 0;
|
|
|
|
else
|
|
|
|
rb->overwrite = 1;
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2019-01-28 20:27:27 +08:00
|
|
|
refcount_set(&rb->refcount, 1);
|
perf: Fix loss of notification with multi-event
When you do:
$ perf record -e cycles,cycles,cycles noploop 10
You expect about 10,000 samples for each event, i.e., 10s at
1000samples/sec. However, this is not what's happening. You
get much fewer samples, maybe 3700 samples/event:
$ perf report -D | tail -15
Aggregated stats:
TOTAL events: 10998
MMAP events: 66
COMM events: 2
SAMPLE events: 10930
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
cycles stats:
TOTAL events: 3642
SAMPLE events: 3642
cycles stats:
TOTAL events: 3644
SAMPLE events: 3644
On a Intel Nehalem or even AMD64, there are 4 counters capable
of measuring cycles, so there is plenty of space to measure those
events without multiplexing (even with the NMI watchdog active).
And even with multiplexing, we'd expect roughly the same number
of samples per event.
The root of the problem was that when the event that caused the buffer
to become full was not the first event passed on the cmdline, the user
notification would get lost. The notification was sent to the file
descriptor of the overflowed event but the perf tool was not polling
on it. The perf tool aggregates all samples into a single buffer,
i.e., the buffer of the first event. Consequently, it assumes
notifications for any event will come via that descriptor.
The seemingly straight forward solution of moving the waitq into the
ringbuffer object doesn't work because of life-time issues. One could
perf_event_set_output() on a fd that you're also blocking on and cause
the old rb object to be freed while its waitq would still be
referenced by the blocked thread -> FAIL.
Therefore link all events to the ringbuffer and broadcast the wakeup
from the ringbuffer object to all possible events that could be waited
upon. This is rather ugly, and we're open to better solutions but it
works for now.
Reported-by: Stephane Eranian <eranian@google.com>
Finished-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-11-26 09:47:31 +08:00
|
|
|
|
|
|
|
INIT_LIST_HEAD(&rb->event_list);
|
|
|
|
spin_lock_init(&rb->event_lock);
|
2016-03-28 14:41:29 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* perf_output_begin() only checks rb->paused, therefore
|
|
|
|
* rb->paused must be true if we have no pages for output.
|
|
|
|
*/
|
|
|
|
if (!rb->nr_pages)
|
|
|
|
rb->paused = 1;
|
2011-05-20 01:55:04 +08:00
|
|
|
}
|
|
|
|
|
2017-02-20 21:33:50 +08:00
|
|
|
void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* OVERWRITE is determined by perf_aux_output_end() and can't
|
|
|
|
* be passed in directly.
|
|
|
|
*/
|
|
|
|
if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
|
|
|
|
return;
|
|
|
|
|
|
|
|
handle->aux_flags |= flags;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(perf_aux_output_flag);
|
|
|
|
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
/*
|
|
|
|
* This is called before hardware starts writing to the AUX area to
|
|
|
|
* obtain an output handle and make sure there's room in the buffer.
|
|
|
|
* When the capture completes, call perf_aux_output_end() to commit
|
|
|
|
* the recorded data to the buffer.
|
|
|
|
*
|
|
|
|
* The ordering is similar to that of perf_output_{begin,end}, with
|
|
|
|
* the exception of (B), which should be taken care of by the pmu
|
|
|
|
* driver, since ordering rules will differ depending on hardware.
|
2016-03-04 21:42:47 +08:00
|
|
|
*
|
|
|
|
* Call this from pmu::start(); see the comment in perf_aux_output_end()
|
|
|
|
* about its use in pmu callbacks. Both can also be called from the PMI
|
|
|
|
* handler if needed.
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
*/
|
|
|
|
void *perf_aux_output_begin(struct perf_output_handle *handle,
|
|
|
|
struct perf_event *event)
|
|
|
|
{
|
|
|
|
struct perf_event *output_event = event;
|
|
|
|
unsigned long aux_head, aux_tail;
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb;
|
2019-05-17 19:52:34 +08:00
|
|
|
unsigned int nest;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
if (output_event->parent)
|
|
|
|
output_event = output_event->parent;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since this will typically be open across pmu::add/pmu::del, we
|
|
|
|
* grab ring_buffer's refcount instead of holding rcu read lock
|
|
|
|
* to make sure it doesn't disappear under us.
|
|
|
|
*/
|
|
|
|
rb = ring_buffer_get(output_event);
|
|
|
|
if (!rb)
|
|
|
|
return NULL;
|
|
|
|
|
2016-09-06 21:23:50 +08:00
|
|
|
if (!rb_has_aux(rb))
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
goto err;
|
|
|
|
|
2016-03-04 21:42:45 +08:00
|
|
|
/*
|
2016-09-06 21:23:50 +08:00
|
|
|
* If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
|
|
|
|
* about to get freed, so we leave immediately.
|
|
|
|
*
|
|
|
|
* Checking rb::aux_mmap_count and rb::refcount has to be done in
|
|
|
|
* the same order, see perf_mmap_close. Otherwise we end up freeing
|
|
|
|
* aux pages in this path, which is a bug, because in_atomic().
|
2016-03-04 21:42:45 +08:00
|
|
|
*/
|
|
|
|
if (!atomic_read(&rb->aux_mmap_count))
|
2016-09-06 21:23:50 +08:00
|
|
|
goto err;
|
|
|
|
|
2019-01-28 20:27:28 +08:00
|
|
|
if (!refcount_inc_not_zero(&rb->aux_refcount))
|
2016-09-06 21:23:50 +08:00
|
|
|
goto err;
|
2016-03-04 21:42:45 +08:00
|
|
|
|
2019-05-17 19:52:34 +08:00
|
|
|
nest = READ_ONCE(rb->aux_nest);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
/*
|
|
|
|
* Nesting is not supported for AUX area, make sure nested
|
|
|
|
* writers are caught early
|
|
|
|
*/
|
2019-05-17 19:52:34 +08:00
|
|
|
if (WARN_ON_ONCE(nest))
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
goto err_put;
|
|
|
|
|
2019-05-17 19:52:34 +08:00
|
|
|
WRITE_ONCE(rb->aux_nest, nest + 1);
|
|
|
|
|
2017-08-17 00:18:16 +08:00
|
|
|
aux_head = rb->aux_head;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
handle->rb = rb;
|
|
|
|
handle->event = event;
|
|
|
|
handle->head = aux_head;
|
2015-01-14 20:18:17 +08:00
|
|
|
handle->size = 0;
|
2017-02-20 21:33:50 +08:00
|
|
|
handle->aux_flags = 0;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
/*
|
2015-01-14 20:18:17 +08:00
|
|
|
* In overwrite mode, AUX data stores do not depend on aux_tail,
|
|
|
|
* therefore (A) control dependency barrier does not exist. The
|
|
|
|
* (B) <-> (C) ordering is still observed by the pmu driver.
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
*/
|
2015-01-14 20:18:17 +08:00
|
|
|
if (!rb->aux_overwrite) {
|
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE()
Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.
For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.
However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:
----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()
// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch
virtual patch
@ depends on patch @
expression E1, E2;
@@
- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)
@ depends on patch @
expression E;
@@
- ACCESS_ONCE(E)
+ READ_ONCE(E)
----
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
|
|
|
aux_tail = READ_ONCE(rb->user_page->aux_tail);
|
2017-08-17 00:18:16 +08:00
|
|
|
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
|
2015-01-14 20:18:17 +08:00
|
|
|
if (aux_head - aux_tail < perf_aux_size(rb))
|
|
|
|
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* handle->size computation depends on aux_tail load; this forms a
|
|
|
|
* control dependency barrier separating aux_tail load from aux data
|
|
|
|
* store that will be enabled on successful return
|
|
|
|
*/
|
|
|
|
if (!handle->size) { /* A, matches D */
|
2019-04-04 21:03:00 +08:00
|
|
|
event->pending_disable = smp_processor_id();
|
2015-01-14 20:18:17 +08:00
|
|
|
perf_output_wakeup(handle);
|
2019-05-17 19:52:34 +08:00
|
|
|
WRITE_ONCE(rb->aux_nest, 0);
|
2015-01-14 20:18:17 +08:00
|
|
|
goto err_put;
|
|
|
|
}
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return handle->rb->aux_priv;
|
|
|
|
|
|
|
|
err_put:
|
2016-03-04 21:42:47 +08:00
|
|
|
/* can't be last */
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
rb_free_aux(rb);
|
|
|
|
|
|
|
|
err:
|
2015-12-03 00:41:11 +08:00
|
|
|
ring_buffer_put(rb);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
handle->event = NULL;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-08-16 23:53:15 +08:00
|
|
|
EXPORT_SYMBOL_GPL(perf_aux_output_begin);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
|
2017-09-07 00:08:11 +08:00
|
|
|
{
|
|
|
|
if (rb->aux_overwrite)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
|
|
|
|
rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
/*
|
|
|
|
* Commit the data written by hardware into the ring buffer by adjusting
|
|
|
|
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
|
|
|
|
* pmu driver's responsibility to observe ordering rules of the hardware,
|
|
|
|
* so that all the data is externally visible before this is called.
|
2016-03-04 21:42:47 +08:00
|
|
|
*
|
|
|
|
* Note: this has to be called from pmu::stop() callback, as the assumption
|
|
|
|
* of the AUX buffer management code is that after pmu::stop(), the AUX
|
|
|
|
* transaction must be stopped and therefore drop the AUX reference count.
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
*/
|
2017-02-20 21:33:50 +08:00
|
|
|
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
{
|
2017-02-20 21:33:51 +08:00
|
|
|
bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb = handle->rb;
|
2015-01-14 20:18:17 +08:00
|
|
|
unsigned long aux_head;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2015-01-14 20:18:17 +08:00
|
|
|
/* in overwrite mode, driver provides aux_head via handle */
|
|
|
|
if (rb->aux_overwrite) {
|
2017-02-20 21:33:50 +08:00
|
|
|
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
|
2015-01-14 20:18:17 +08:00
|
|
|
|
|
|
|
aux_head = handle->head;
|
2017-08-17 00:18:16 +08:00
|
|
|
rb->aux_head = aux_head;
|
2015-01-14 20:18:17 +08:00
|
|
|
} else {
|
2017-02-20 21:33:50 +08:00
|
|
|
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
|
|
|
|
|
2017-08-17 00:18:16 +08:00
|
|
|
aux_head = rb->aux_head;
|
|
|
|
rb->aux_head += size;
|
2015-01-14 20:18:17 +08:00
|
|
|
}
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2019-03-29 17:13:38 +08:00
|
|
|
/*
|
|
|
|
* Only send RECORD_AUX if we have something useful to communicate
|
|
|
|
*
|
|
|
|
* Note: the OVERWRITE records by themselves are not considered
|
|
|
|
* useful, as they don't communicate any *new* information,
|
|
|
|
* aside from the short-lived offset, that becomes history at
|
|
|
|
* the next event sched-in and therefore isn't useful.
|
|
|
|
* The userspace that needs to copy out AUX data in overwrite
|
|
|
|
* mode should know to use user_page::aux_head for the actual
|
|
|
|
* offset. So, from now on we don't output AUX records that
|
|
|
|
* have *only* OVERWRITE flag set.
|
|
|
|
*/
|
|
|
|
if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
|
|
|
|
perf_event_aux_event(handle->event, aux_head, size,
|
|
|
|
handle->aux_flags);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2019-05-17 19:52:33 +08:00
|
|
|
WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
|
2017-09-07 00:08:11 +08:00
|
|
|
if (rb_need_aux_wakeup(rb))
|
2016-05-10 21:18:33 +08:00
|
|
|
wakeup = true;
|
|
|
|
|
|
|
|
if (wakeup) {
|
2017-02-20 21:33:50 +08:00
|
|
|
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
|
2019-04-04 21:03:00 +08:00
|
|
|
handle->event->pending_disable = smp_processor_id();
|
2016-05-10 21:18:33 +08:00
|
|
|
perf_output_wakeup(handle);
|
|
|
|
}
|
|
|
|
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
handle->event = NULL;
|
|
|
|
|
2019-05-17 19:52:34 +08:00
|
|
|
WRITE_ONCE(rb->aux_nest, 0);
|
2016-03-04 21:42:47 +08:00
|
|
|
/* can't be last */
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
rb_free_aux(rb);
|
2015-12-03 00:41:11 +08:00
|
|
|
ring_buffer_put(rb);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
}
|
2016-08-16 23:53:15 +08:00
|
|
|
EXPORT_SYMBOL_GPL(perf_aux_output_end);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Skip over a given number of bytes in the AUX buffer, due to, for example,
|
|
|
|
* hardware's alignment constraints.
|
|
|
|
*/
|
|
|
|
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
|
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb = handle->rb;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
if (size > handle->size)
|
|
|
|
return -ENOSPC;
|
|
|
|
|
2017-08-17 00:18:16 +08:00
|
|
|
rb->aux_head += size;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2019-05-17 19:52:33 +08:00
|
|
|
WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
|
2017-09-07 00:08:11 +08:00
|
|
|
if (rb_need_aux_wakeup(rb)) {
|
2015-01-14 20:18:18 +08:00
|
|
|
perf_output_wakeup(handle);
|
2017-08-17 00:18:16 +08:00
|
|
|
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
|
2015-01-14 20:18:18 +08:00
|
|
|
}
|
|
|
|
|
2017-08-17 00:18:16 +08:00
|
|
|
handle->head = rb->aux_head;
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
handle->size -= size;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2016-08-16 23:53:15 +08:00
|
|
|
EXPORT_SYMBOL_GPL(perf_aux_output_skip);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
|
|
|
void *perf_get_aux(struct perf_output_handle *handle)
|
|
|
|
{
|
|
|
|
/* this is only valid between perf_aux_output_begin and *_end */
|
|
|
|
if (!handle->event)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return handle->rb->aux_priv;
|
|
|
|
}
|
2016-08-16 23:53:15 +08:00
|
|
|
EXPORT_SYMBOL_GPL(perf_get_aux);
|
perf: Add API for PMUs to write to the AUX area
For pmus that wish to write data to ring buffer's AUX area, provide
perf_aux_output_{begin,end}() calls to initiate/commit data writes,
similarly to perf_output_{begin,end}. These also use the same output
handle structure. Also, similarly to software counterparts, these
will direct inherited events' output to parents' ring buffers.
After the perf_aux_output_begin() returns successfully, handle->size
is set to the maximum amount of data that can be written wrt aux_tail
pointer, so that no data that the user hasn't seen will be overwritten,
therefore this should always be called before hardware writing is
enabled. On success, this will return the pointer to pmu driver's
private structure allocated for this aux area by pmu::setup_aux. Same
pointer can also be retrieved using perf_get_aux() while hardware
writing is enabled.
PMU driver should pass the actual amount of data written as a parameter
to perf_aux_output_end(). All hardware writes should be completed and
visible before this one is called.
Additionally, perf_aux_output_skip() will adjust output handle and
aux_head in case some part of the buffer has to be skipped over to
maintain hardware's alignment constraints.
Nested writers are forbidden and guards are in place to catch such
attempts.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:16 +08:00
|
|
|
|
2019-10-25 22:08:33 +08:00
|
|
|
/*
|
|
|
|
* Copy out AUX data from an AUX handle.
|
|
|
|
*/
|
|
|
|
long perf_output_copy_aux(struct perf_output_handle *aux_handle,
|
|
|
|
struct perf_output_handle *handle,
|
|
|
|
unsigned long from, unsigned long to)
|
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb = aux_handle->rb;
|
2019-10-25 22:08:33 +08:00
|
|
|
unsigned long tocopy, remainder, len = 0;
|
|
|
|
void *addr;
|
|
|
|
|
|
|
|
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
|
|
|
|
to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
|
|
|
|
|
|
|
|
do {
|
|
|
|
tocopy = PAGE_SIZE - offset_in_page(from);
|
|
|
|
if (to > from)
|
|
|
|
tocopy = min(tocopy, to - from);
|
|
|
|
if (!tocopy)
|
|
|
|
break;
|
|
|
|
|
|
|
|
addr = rb->aux_pages[from >> PAGE_SHIFT];
|
|
|
|
addr += offset_in_page(from);
|
|
|
|
|
|
|
|
remainder = perf_output_copy(handle, addr, tocopy);
|
|
|
|
if (remainder)
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
len += tocopy;
|
|
|
|
from += tocopy;
|
|
|
|
from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
|
|
|
|
} while (to != from);
|
|
|
|
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
2015-01-14 20:18:12 +08:00
|
|
|
#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
|
|
|
|
|
|
|
|
static struct page *rb_alloc_aux_page(int node, int order)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
if (order > MAX_ORDER)
|
|
|
|
order = MAX_ORDER;
|
|
|
|
|
|
|
|
do {
|
|
|
|
page = alloc_pages_node(node, PERF_AUX_GFP, order);
|
|
|
|
} while (!page && order--);
|
|
|
|
|
|
|
|
if (page && order) {
|
|
|
|
/*
|
2015-07-28 14:00:04 +08:00
|
|
|
* Communicate the allocation size to the driver:
|
|
|
|
* if we managed to secure a high-order allocation,
|
|
|
|
* set its first page's private to this order;
|
|
|
|
* !PagePrivate(page) means it's just a normal page.
|
2015-01-14 20:18:12 +08:00
|
|
|
*/
|
|
|
|
split_page(page, order);
|
|
|
|
SetPagePrivate(page);
|
|
|
|
set_page_private(page, order);
|
|
|
|
}
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
static void rb_free_aux_page(struct perf_buffer *rb, int idx)
|
2015-01-14 20:18:12 +08:00
|
|
|
{
|
|
|
|
struct page *page = virt_to_page(rb->aux_pages[idx]);
|
|
|
|
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
page->mapping = NULL;
|
|
|
|
__free_page(page);
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
static void __rb_free_aux(struct perf_buffer *rb)
|
2016-01-19 23:14:29 +08:00
|
|
|
{
|
|
|
|
int pg;
|
|
|
|
|
2015-12-03 00:41:11 +08:00
|
|
|
/*
|
|
|
|
* Should never happen, the last reference should be dropped from
|
|
|
|
* perf_mmap_close() path, which first stops aux transactions (which
|
|
|
|
* in turn are the atomic holders of aux_refcount) and then does the
|
|
|
|
* last rb_free_aux().
|
|
|
|
*/
|
|
|
|
WARN_ON_ONCE(in_atomic());
|
|
|
|
|
2016-01-19 23:14:29 +08:00
|
|
|
if (rb->aux_priv) {
|
|
|
|
rb->free_aux(rb->aux_priv);
|
|
|
|
rb->free_aux = NULL;
|
|
|
|
rb->aux_priv = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rb->aux_nr_pages) {
|
|
|
|
for (pg = 0; pg < rb->aux_nr_pages; pg++)
|
|
|
|
rb_free_aux_page(rb, pg);
|
|
|
|
|
|
|
|
kfree(rb->aux_pages);
|
|
|
|
rb->aux_nr_pages = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
|
2015-01-14 20:18:18 +08:00
|
|
|
pgoff_t pgoff, int nr_pages, long watermark, int flags)
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
{
|
|
|
|
bool overwrite = !(flags & RING_BUFFER_WRITABLE);
|
|
|
|
int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
|
2019-02-15 19:47:27 +08:00
|
|
|
int ret = -ENOMEM, max_order;
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
|
|
|
if (!has_aux(event))
|
2017-06-20 18:26:39 +08:00
|
|
|
return -EOPNOTSUPP;
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
2019-05-03 16:55:35 +08:00
|
|
|
if (!overwrite) {
|
2021-04-14 23:49:54 +08:00
|
|
|
/*
|
|
|
|
* Watermark defaults to half the buffer, and so does the
|
|
|
|
* max_order, to aid PMU drivers in double buffering.
|
|
|
|
*/
|
|
|
|
if (!watermark)
|
|
|
|
watermark = nr_pages << (PAGE_SHIFT - 1);
|
2015-01-14 20:18:13 +08:00
|
|
|
|
2021-04-14 23:49:54 +08:00
|
|
|
/*
|
|
|
|
* Use aux_watermark as the basis for chunking to
|
|
|
|
* help PMU drivers honor the watermark.
|
|
|
|
*/
|
|
|
|
max_order = get_order(watermark);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We need to start with the max_order that fits in nr_pages,
|
|
|
|
* not the other way around, hence ilog2() and not get_order.
|
|
|
|
*/
|
|
|
|
max_order = ilog2(nr_pages);
|
|
|
|
watermark = 0;
|
2015-01-14 20:18:13 +08:00
|
|
|
}
|
|
|
|
|
treewide: kzalloc_node() -> kcalloc_node()
The kzalloc_node() function has a 2-factor argument form, kcalloc_node(). This
patch replaces cases of:
kzalloc_node(a * b, gfp, node)
with:
kcalloc_node(a * b, gfp, node)
as well as handling cases of:
kzalloc_node(a * b * c, gfp, node)
with:
kzalloc_node(array3_size(a, b, c), gfp, node)
as it's slightly less ugly than:
kcalloc_node(array_size(a, b), c, gfp, node)
This does, however, attempt to ignore constant size factors like:
kzalloc_node(4 * 1024, gfp, node)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kzalloc_node(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kzalloc_node(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kzalloc_node(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kzalloc_node(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kzalloc_node
+ kcalloc_node
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kzalloc_node(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc_node(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc_node(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc_node(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc_node(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc_node(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc_node(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc_node(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kzalloc_node(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc_node(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc_node(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc_node(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc_node(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kzalloc_node(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kzalloc_node(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc_node(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kzalloc_node(C1 * C2 * C3, ...)
|
kzalloc_node(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc_node(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc_node(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc_node(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kzalloc_node(sizeof(THING) * C2, ...)
|
kzalloc_node(sizeof(TYPE) * C2, ...)
|
kzalloc_node(C1 * C2 * C3, ...)
|
kzalloc_node(C1 * C2, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- (E1) * E2
+ E1, E2
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kzalloc_node
+ kcalloc_node
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 05:04:20 +08:00
|
|
|
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
|
|
|
|
node);
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
if (!rb->aux_pages)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
rb->free_aux = event->pmu->free_aux;
|
2015-01-14 20:18:12 +08:00
|
|
|
for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) {
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
struct page *page;
|
2015-01-14 20:18:12 +08:00
|
|
|
int last, order;
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
2015-01-14 20:18:12 +08:00
|
|
|
order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages));
|
|
|
|
page = rb_alloc_aux_page(node, order);
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
if (!page)
|
|
|
|
goto out;
|
|
|
|
|
2015-01-14 20:18:12 +08:00
|
|
|
for (last = rb->aux_nr_pages + (1 << page_private(page));
|
|
|
|
last > rb->aux_nr_pages; rb->aux_nr_pages++)
|
|
|
|
rb->aux_pages[rb->aux_nr_pages] = page_address(page++);
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
}
|
|
|
|
|
2015-05-22 23:30:20 +08:00
|
|
|
/*
|
|
|
|
* In overwrite mode, PMUs that don't support SG may not handle more
|
|
|
|
* than one contiguous allocation, since they rely on PMI to do double
|
|
|
|
* buffering. In this case, the entire buffer has to be one contiguous
|
|
|
|
* chunk.
|
|
|
|
*/
|
|
|
|
if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) &&
|
|
|
|
overwrite) {
|
|
|
|
struct page *page = virt_to_page(rb->aux_pages[0]);
|
|
|
|
|
|
|
|
if (page_private(page) != max_order)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-02-01 02:47:08 +08:00
|
|
|
rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages,
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
overwrite);
|
|
|
|
if (!rb->aux_priv)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* aux_pages (and pmu driver's private data, aux_priv) will be
|
|
|
|
* referenced in both producer's and consumer's contexts, thus
|
|
|
|
* we keep a refcount here to make sure either of the two can
|
|
|
|
* reference them safely.
|
|
|
|
*/
|
2019-01-28 20:27:28 +08:00
|
|
|
refcount_set(&rb->aux_refcount, 1);
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
2015-01-14 20:18:17 +08:00
|
|
|
rb->aux_overwrite = overwrite;
|
2015-01-14 20:18:18 +08:00
|
|
|
rb->aux_watermark = watermark;
|
|
|
|
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
out:
|
|
|
|
if (!ret)
|
|
|
|
rb->aux_pgoff = pgoff;
|
|
|
|
else
|
2016-01-19 23:14:29 +08:00
|
|
|
__rb_free_aux(rb);
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
void rb_free_aux(struct perf_buffer *rb)
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
{
|
2019-01-28 20:27:28 +08:00
|
|
|
if (refcount_dec_and_test(&rb->aux_refcount))
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
__rb_free_aux(rb);
|
|
|
|
}
|
|
|
|
|
2011-05-20 01:55:04 +08:00
|
|
|
#ifndef CONFIG_PERF_USE_VMALLOC
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Back perf_mmap() with regular GFP_KERNEL-0 pages.
|
|
|
|
*/
|
|
|
|
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
static struct page *
|
2019-12-14 02:21:30 +08:00
|
|
|
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
|
|
|
if (pgoff > rb->nr_pages)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (pgoff == 0)
|
|
|
|
return virt_to_page(rb->user_page);
|
|
|
|
|
|
|
|
return virt_to_page(rb->data_pages[pgoff - 1]);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *perf_mmap_alloc_page(int cpu)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
int node;
|
|
|
|
|
|
|
|
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
|
|
|
|
page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
|
|
|
|
if (!page)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return page_address(page);
|
|
|
|
}
|
|
|
|
|
2019-10-14 16:15:57 +08:00
|
|
|
static void perf_mmap_free_page(void *addr)
|
|
|
|
{
|
|
|
|
struct page *page = virt_to_page(addr);
|
|
|
|
|
|
|
|
page->mapping = NULL;
|
|
|
|
__free_page(page);
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
unsigned long size;
|
2021-03-15 11:34:36 +08:00
|
|
|
int i, node;
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
size = sizeof(struct perf_buffer);
|
2011-05-20 01:55:04 +08:00
|
|
|
size += nr_pages * sizeof(void *);
|
|
|
|
|
2019-02-13 14:57:02 +08:00
|
|
|
if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
|
2019-01-10 22:27:45 +08:00
|
|
|
goto fail;
|
|
|
|
|
2021-03-15 11:34:36 +08:00
|
|
|
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
|
|
|
|
rb = kzalloc_node(size, GFP_KERNEL, node);
|
2011-05-20 01:55:04 +08:00
|
|
|
if (!rb)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
rb->user_page = perf_mmap_alloc_page(cpu);
|
|
|
|
if (!rb->user_page)
|
|
|
|
goto fail_user_page;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
rb->data_pages[i] = perf_mmap_alloc_page(cpu);
|
|
|
|
if (!rb->data_pages[i])
|
|
|
|
goto fail_data_pages;
|
|
|
|
}
|
|
|
|
|
|
|
|
rb->nr_pages = nr_pages;
|
|
|
|
|
|
|
|
ring_buffer_init(rb, watermark, flags);
|
|
|
|
|
|
|
|
return rb;
|
|
|
|
|
|
|
|
fail_data_pages:
|
|
|
|
for (i--; i >= 0; i--)
|
2019-10-14 16:15:57 +08:00
|
|
|
perf_mmap_free_page(rb->data_pages[i]);
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2019-10-14 16:15:57 +08:00
|
|
|
perf_mmap_free_page(rb->user_page);
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
fail_user_page:
|
|
|
|
kfree(rb);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
void rb_free(struct perf_buffer *rb)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2019-10-14 16:14:59 +08:00
|
|
|
perf_mmap_free_page(rb->user_page);
|
2011-05-20 01:55:04 +08:00
|
|
|
for (i = 0; i < rb->nr_pages; i++)
|
2019-10-14 16:14:59 +08:00
|
|
|
perf_mmap_free_page(rb->data_pages[i]);
|
2011-05-20 01:55:04 +08:00
|
|
|
kfree(rb);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
static struct page *
|
2019-12-14 02:21:30 +08:00
|
|
|
__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
2013-03-19 22:35:09 +08:00
|
|
|
/* The '>' counts in the user page. */
|
|
|
|
if (pgoff > data_page_nr(rb))
|
2011-05-20 01:55:04 +08:00
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void perf_mmap_unmark_page(void *addr)
|
|
|
|
{
|
|
|
|
struct page *page = vmalloc_to_page(addr);
|
|
|
|
|
|
|
|
page->mapping = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void rb_free_work(struct work_struct *work)
|
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
void *base;
|
|
|
|
int i, nr;
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
rb = container_of(work, struct perf_buffer, work);
|
2013-03-19 22:35:09 +08:00
|
|
|
nr = data_page_nr(rb);
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
base = rb->user_page;
|
2013-03-19 22:35:09 +08:00
|
|
|
/* The '<=' counts in the user page. */
|
|
|
|
for (i = 0; i <= nr; i++)
|
2011-05-20 01:55:04 +08:00
|
|
|
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
|
|
|
|
|
|
|
|
vfree(base);
|
|
|
|
kfree(rb);
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
void rb_free(struct perf_buffer *rb)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
|
|
|
schedule_work(&rb->work);
|
|
|
|
}
|
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
|
2011-05-20 01:55:04 +08:00
|
|
|
{
|
2019-12-14 02:21:30 +08:00
|
|
|
struct perf_buffer *rb;
|
2011-05-20 01:55:04 +08:00
|
|
|
unsigned long size;
|
|
|
|
void *all_buf;
|
2021-03-15 11:34:36 +08:00
|
|
|
int node;
|
2011-05-20 01:55:04 +08:00
|
|
|
|
2019-12-14 02:21:30 +08:00
|
|
|
size = sizeof(struct perf_buffer);
|
2011-05-20 01:55:04 +08:00
|
|
|
size += sizeof(void *);
|
|
|
|
|
2021-03-15 11:34:36 +08:00
|
|
|
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
|
|
|
|
rb = kzalloc_node(size, GFP_KERNEL, node);
|
2011-05-20 01:55:04 +08:00
|
|
|
if (!rb)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
INIT_WORK(&rb->work, rb_free_work);
|
|
|
|
|
|
|
|
all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
|
|
|
|
if (!all_buf)
|
|
|
|
goto fail_all_buf;
|
|
|
|
|
|
|
|
rb->user_page = all_buf;
|
|
|
|
rb->data_pages[0] = all_buf + PAGE_SIZE;
|
2016-01-29 22:17:51 +08:00
|
|
|
if (nr_pages) {
|
|
|
|
rb->nr_pages = 1;
|
|
|
|
rb->page_order = ilog2(nr_pages);
|
|
|
|
}
|
2011-05-20 01:55:04 +08:00
|
|
|
|
|
|
|
ring_buffer_init(rb, watermark, flags);
|
|
|
|
|
|
|
|
return rb;
|
|
|
|
|
|
|
|
fail_all_buf:
|
|
|
|
kfree(rb);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
|
|
|
|
struct page *
|
2019-12-14 02:21:30 +08:00
|
|
|
perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
{
|
|
|
|
if (rb->aux_nr_pages) {
|
|
|
|
/* above AUX space */
|
|
|
|
if (pgoff > rb->aux_pgoff + rb->aux_nr_pages)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/* AUX space */
|
2018-04-20 20:03:18 +08:00
|
|
|
if (pgoff >= rb->aux_pgoff) {
|
|
|
|
int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages);
|
|
|
|
return virt_to_page(rb->aux_pages[aux_pgoff]);
|
|
|
|
}
|
perf: Add AUX area to ring buffer for raw data streams
This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.
AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.
In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.
Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-01-14 20:18:11 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return __perf_mmap_to_page(rb, pgoff);
|
|
|
|
}
|