linux/kernel/trace/ring_buffer.c

6174 lines
164 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Generic ring buffer
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
#include <linux/sched/clock.h>
#include <linux/trace_seq.h>
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <linux/spinlock.h>
#include <linux/irq_work.h>
#include <linux/security.h>
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kthread.h> /* for self test */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/delay.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/oom.h>
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#include <asm/local.h>
/*
* The "absolute" timestamp in the buffer is only 59 bits.
* If a clock has the 5 MSBs set, it needs to be saved and
* reinserted.
*/
#define TS_MSB (0xf8ULL << 56)
#define ABS_TS_MASK (~TS_MSB)
static void update_pages_handler(struct work_struct *work);
/*
* The ring buffer header is special. We must manually up keep it.
*/
int ring_buffer_print_entry_header(struct trace_seq *s)
{
trace_seq_puts(s, "# compressed entry header\n");
trace_seq_puts(s, "\ttype_len : 5 bits\n");
trace_seq_puts(s, "\ttime_delta : 27 bits\n");
trace_seq_puts(s, "\tarray : 32 bits\n");
trace_seq_putc(s, '\n');
trace_seq_printf(s, "\tpadding : type == %d\n",
RINGBUF_TYPE_PADDING);
trace_seq_printf(s, "\ttime_extend : type == %d\n",
RINGBUF_TYPE_TIME_EXTEND);
trace_seq_printf(s, "\ttime_stamp : type == %d\n",
RINGBUF_TYPE_TIME_STAMP);
trace_seq_printf(s, "\tdata max type_len == %d\n",
RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
return !trace_seq_has_overflowed(s);
}
/*
* The ring buffer is made up of a list of pages. A separate list of pages is
* allocated for each CPU. A writer may only write to a buffer that is
* associated with the CPU it is currently executing on. A reader may read
* from any per cpu buffer.
*
* The reader is special. For each per cpu buffer, the reader has its own
* reader page. When a reader has read the entire reader page, this reader
* page is swapped with another page in the ring buffer.
*
* Now, as long as the writer is off the reader page, the reader can do what
* ever it wants with that page. The writer will never write to that page
* again (as long as it is out of the ring buffer).
*
* Here's some silly ASCII art.
*
* +------+
* |reader| RING BUFFER
* |page |
* +------+ +---+ +---+ +---+
* | |-->| |-->| |
* +---+ +---+ +---+
* ^ |
* | |
* +---------------+
*
*
* +------+
* |reader| RING BUFFER
* |page |------------------v
* +------+ +---+ +---+ +---+
* | |-->| |-->| |
* +---+ +---+ +---+
* ^ |
* | |
* +---------------+
*
*
* +------+
* |reader| RING BUFFER
* |page |------------------v
* +------+ +---+ +---+ +---+
* ^ | |-->| |-->| |
* | +---+ +---+ +---+
* | |
* | |
* +------------------------------+
*
*
* +------+
* |buffer| RING BUFFER
* |page |------------------v
* +------+ +---+ +---+ +---+
* ^ | | | |-->| |
* | New +---+ +---+ +---+
* | Reader------^ |
* | page |
* +------------------------------+
*
*
* After we make this swap, the reader can hand this page off to the splice
* code and be done with it. It can even allocate a new page if it needs to
* and swap that into the ring buffer.
*
* We will be using cmpxchg soon to make all this lockless.
*
*/
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
ring-buffer: buffer record on/off switch Impact: enable/disable ring buffer recording API added Several kernel developers have requested that there be a way to stop recording into the ring buffers with a simple switch that can also be enabled from userspace. This patch addes a new kernel API to the ring buffers called: tracing_on() tracing_off() When tracing_off() is called, all ring buffers will not be able to record into their buffers. tracing_on() will enable the ring buffers again. These two act like an on/off switch. That is, there is no counting of the number of times tracing_off or tracing_on has been called. A new file is added to the debugfs/tracing directory called tracing_on This allows for userspace applications to also flip the switch. echo 0 > debugfs/tracing/tracing_on disables the tracing. echo 1 > /debugfs/tracing/tracing_on enables it. Note, this does not disable or enable any tracers. It only sets or clears a flag that needs to be set in order for the ring buffers to write to their buffers. It is a global flag, and affects all ring buffers. The buffers start out with tracing_on enabled. There are now three flags that control recording into the buffers: tracing_on: which affects all ring buffer tracers. buffer->record_disabled: which affects an allocated buffer, which may be set if an anomaly is detected, and tracing is disabled. cpu_buffer->record_disabled: which is set by tracing_stop() or if an anomaly is detected. tracing_start can not reenable this if an anomaly occurred. The userspace debugfs/tracing/tracing_enabled is implemented with tracing_stop() but the user space code can not enable it if the kernel called tracing_stop(). Userspace can enable the tracing_on even if the kernel disabled it. It is just a switch used to stop tracing if a condition was hit. tracing_on is not for protecting critical areas in the kernel nor is it for stopping tracing if an anomaly occurred. This is because userspace can reenable it at any time. Side effect: With this patch, I discovered a dead variable in ftrace.c called tracing_on. This patch removes it. Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2008-11-12 04:01:42 +08:00
#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
# define RB_FORCE_8BYTE_ALIGNMENT 0
# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
#else
# define RB_FORCE_8BYTE_ALIGNMENT 1
# define RB_ARCH_ALIGNMENT 8U
#endif
#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
enum {
RB_LEN_TIME_EXTEND = 8,
RB_LEN_TIME_STAMP = 8,
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
};
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
#define skip_time_extend(event) \
((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
static inline bool rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
static void rb_event_set_padding(struct ring_buffer_event *event)
{
/* padding has a NULL time_delta */
event->type_len = RINGBUF_TYPE_PADDING;
event->time_delta = 0;
}
static unsigned
rb_event_data_length(struct ring_buffer_event *event)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
unsigned length;
if (event->type_len)
length = event->type_len * RB_ALIGNMENT;
else
length = event->array[0];
return length + RB_EVNT_HDR_SIZE;
}
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/*
* Return the length of the given event. Will return
* the length of the time extend if the event is a
* time extend.
*/
static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
switch (event->type_len) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event))
/* undefined */
return -1;
return event->array[0] + RB_EVNT_HDR_SIZE;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_TIME_EXTEND:
return RB_LEN_TIME_EXTEND;
case RINGBUF_TYPE_TIME_STAMP:
return RB_LEN_TIME_STAMP;
case RINGBUF_TYPE_DATA:
return rb_event_data_length(event);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
default:
WARN_ON_ONCE(1);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/* not hit */
return 0;
}
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/*
* Return total length of time extend and data,
* or just the event length for all other events.
*/
static inline unsigned
rb_event_ts_length(struct ring_buffer_event *event)
{
unsigned len = 0;
if (extended_time(event)) {
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/* time extends include the data event after it */
len = RB_LEN_TIME_EXTEND;
event = skip_time_extend(event);
}
return len + rb_event_length(event);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_event_length - return the length of the event
* @event: the event to get the length of
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
*
* Returns the size of the data load of a data event.
* If the event is something other than a data event, it
* returns the size of the event itself. With the exception
* of a TIME EXTEND, where it still returns the size of the
* data load of the data event after it.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
unsigned length;
if (extended_time(event))
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
event = skip_time_extend(event);
length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
length -= sizeof(event->array[0]);
return length;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_event_length);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* inline for ring buffer fast paths */
static __always_inline void *
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_event_data(struct ring_buffer_event *event)
{
if (extended_time(event))
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
event = skip_time_extend(event);
WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* If length is in len field, then array[0] has the data */
if (event->type_len)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return (void *)&event->array[0];
/* Otherwise length is in array[0] and array[1] has the data */
return (void *)&event->array[1];
}
/**
* ring_buffer_event_data - return the data of the event
* @event: the event to get the data from
*/
void *ring_buffer_event_data(struct ring_buffer_event *event)
{
return rb_event_data(event);
}
EXPORT_SYMBOL_GPL(ring_buffer_event_data);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
ts = event->array[0];
ts <<= TS_SHIFT;
ts += event->time_delta;
return ts;
}
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/* Flag when events were overwritten */
#define RB_MISSED_EVENTS (1 << 31)
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
* page will be at the beginning of a cache line, and thus
* the least significant bits will be zero. We use this to
* add flags in the list struct pointers, to make the ring buffer
* lockless.
*/
struct buffer_page {
struct list_head list; /* list of buffer pages */
local_t write; /* index for next write */
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
struct buffer_data_page *page; /* Actual data page */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
};
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* The buffer page counters, write and entries, must be reset
* atomically when crossing page boundaries. To synchronize this
* update, two counters are inserted into the number. One is
* the actual counter for the write position or count on the page.
*
* The other is a counter of updaters. Before an update happens
* the update partition of the counter is incremented. This will
* allow the updater to update the counter atomically.
*
* The counter is 20 bits, and the state data is 12.
*/
#define RB_WRITE_MASK 0xfffff
#define RB_WRITE_INTCNT (1 << 20)
static void rb_init_page(struct buffer_data_page *bpage)
{
local_set(&bpage->commit, 0);
}
static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
{
return local_read(&bpage->page->commit);
}
static void free_buffer_page(struct buffer_page *bpage)
{
free_page((unsigned long)bpage->page);
kfree(bpage);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* We need to fit the time_stamp delta into 27 bits.
*/
static inline bool test_time_stamp(u64 delta)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
return !!(delta & TS_DELTA_TEST);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
trace_seq_printf(s, "\tfield: u64 timestamp;\t"
"offset:0;\tsize:%u;\tsigned:%u;\n",
(unsigned int)sizeof(field.time_stamp),
(unsigned int)is_signed_type(u64));
trace_seq_printf(s, "\tfield: local_t commit;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
trace_seq_printf(s, "\tfield: int overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
(unsigned int)is_signed_type(long));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
(unsigned int)BUF_PAGE_SIZE,
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
}
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
wait_queue_head_t full_waiters;
long wait_index;
bool waiters_pending;
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
bool full_waiters_pending;
bool wakeup_full;
};
/*
* Structure to hold event state and handle nested events.
*/
struct rb_event_info {
u64 ts;
u64 delta;
u64 before;
u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/*
* Used for the add_timestamp
* NONE
* EXTEND - wants a time extend
* ABSOLUTE - the buffer requests all events to have absolute time stamps
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
* FORCE - force a full time stamp.
*/
enum {
RB_ADD_STAMP_NONE = 0,
RB_ADD_STAMP_EXTEND = BIT(1),
RB_ADD_STAMP_ABSOLUTE = BIT(2),
RB_ADD_STAMP_FORCE = BIT(3)
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
};
/*
* Used for which event context the event is in.
* TRANSITION = 0
* NMI = 1
* IRQ = 2
* SOFTIRQ = 3
* NORMAL = 4
*
* See trace_recursive_lock() comment below for more details.
*/
enum {
RB_CTX_TRANSITION,
RB_CTX_NMI,
RB_CTX_IRQ,
RB_CTX_SOFTIRQ,
RB_CTX_NORMAL,
RB_CTX_MAX
};
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
#if BITS_PER_LONG == 32
#define RB_TIME_32
#endif
/* To test on 64 bit machines */
//#define RB_TIME_32
#ifdef RB_TIME_32
struct rb_time_struct {
local_t cnt;
local_t top;
local_t bottom;
local_t msb;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
};
#else
#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
struct ring_buffer_per_cpu {
int cpu;
atomic_t record_disabled;
atomic_t resize_disabled;
struct trace_buffer *buffer;
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
struct lock_class_key lock_key;
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
struct buffer_data_page *free_page;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
unsigned long lost_events;
unsigned long last_overrun;
unsigned long nest;
local_t entries_bytes;
local_t entries;
local_t overrun;
local_t commit_overrun;
local_t dropped_events;
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
local_t committing;
local_t commits;
local_t pages_touched;
local_t pages_lost;
local_t pages_read;
long last_pages_touch;
size_t shortest_full;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
unsigned long read;
unsigned long read_bytes;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
rb_time_t write_stamp;
rb_time_t before_stamp;
u64 event_stamp[MAX_NEST];
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
u64 read_stamp;
/* pages removed since last reset */
unsigned long pages_removed;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
struct rb_irq_work irq_work;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
};
struct trace_buffer {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
unsigned flags;
int cpus;
atomic_t record_disabled;
ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin <chen.lin5@zte.com.cn> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-19 15:58:47 +08:00
atomic_t resizing;
ring_buffer: pahole struct ring_buffer While fixing some bugs in pahole (built-in.o files were not being processed due to relocation problems) I found out about these packable structures: $ pahole --packable kernel/trace/ring_buffer.o | grep ring ring_buffer 72 64 8 ring_buffer_per_cpu 112 104 8 If we take a look at the current layout of struct ring_buffer we can see that we have two 4 bytes holes. $ pahole -C ring_buffer kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ cpumask_var_t cpumask; /* 16 8 */ atomic_t record_disabled; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 32 32 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ring_buffer_per_cpu * * buffers; /* 64 8 */ /* size: 72, cachelines: 2, members: 7 */ /* sum members: 64, holes: 2, sum holes: 8 */ /* last cacheline: 8 bytes */ }; So, if I ask pahole to reorganize it: $ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ atomic_t record_disabled; /* 12 4 */ cpumask_var_t cpumask; /* 16 8 */ struct mutex mutex; /* 24 32 */ struct ring_buffer_per_cpu * * buffers; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 7 */ }; /* saved 8 bytes and 1 cacheline! */ We get it using just one 64 bytes cacheline. To see what it did: $ pahole -C ring_buffer --reorganize --show_reorg_steps \ kernel/trace/ring_buffer.o | grep \/ /* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */ Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-02-10 03:04:06 +08:00
cpumask_var_t cpumask;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 00:18:39 +08:00
struct lock_class_key *reader_lock_key;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
struct mutex mutex;
struct ring_buffer_per_cpu **buffers;
struct hlist_node node;
u64 (*clock)(void);
struct rb_irq_work irq_work;
bool time_stamp_abs;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
};
struct ring_buffer_iter {
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long head;
unsigned long next_event;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
unsigned long cache_pages_removed;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
int missed_events;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
};
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
#ifdef RB_TIME_32
/*
* On 32 bit machines, local64_t is very expensive. As the ring
* buffer doesn't need all the features of a true 64 bit atomic,
* on 32 bit, it uses these functions (64 still uses local64_t).
*
* For the ring buffer, 64 bit required operations for the time is
* the following:
*
* - Reads may fail if it interrupted a modification of the time stamp.
* It will succeed if it did not interrupt another write even if
* the read itself is interrupted by a write.
* It returns whether it was successful or not.
*
* - Writes always succeed and will overwrite other writes and writes
* that were done by events interrupting the current write.
*
* - A write followed by a read of the same time stamp will always succeed,
* but may not contain the same value.
*
* - A cmpxchg will fail if it interrupted another write or cmpxchg.
* Other than that, it acts like a normal cmpxchg.
*
* The 60 bit time stamp is broken up by 30 bits in a top and bottom half
* (bottom being the least significant 30 bits of the 60 bit time stamp).
*
* The two most significant bits of each half holds a 2 bit counter (0-3).
* Each update will increment this counter by one.
* When reading the top and bottom, if the two counter bits match then the
* top and bottom together make a valid 60 bit number.
*/
#define RB_TIME_SHIFT 30
#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
#define RB_TIME_MSB_SHIFT 60
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
static inline int rb_time_cnt(unsigned long val)
{
return (val >> RB_TIME_SHIFT) & 3;
}
static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
{
u64 val;
val = top & RB_TIME_VAL_MASK;
val <<= RB_TIME_SHIFT;
val |= bottom & RB_TIME_VAL_MASK;
return val;
}
static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
{
unsigned long top, bottom, msb;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
unsigned long c;
/*
* If the read is interrupted by a write, then the cnt will
* be different. Loop until both top and bottom have been read
* without interruption.
*/
do {
c = local_read(&t->cnt);
top = local_read(&t->top);
bottom = local_read(&t->bottom);
msb = local_read(&t->msb);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
} while (c != local_read(&t->cnt));
*cnt = rb_time_cnt(top);
ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg() The following race can cause rb_time_read() to observe a corrupted time stamp: rb_time_cmpxchg() [...] if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) return false; if (!rb_time_read_cmpxchg(&t->top, top, top2)) return false; <interrupted before updating bottom> __rb_time_read() [...] do { c = local_read(&t->cnt); top = local_read(&t->top); bottom = local_read(&t->bottom); msb = local_read(&t->msb); } while (c != local_read(&t->cnt)); *cnt = rb_time_cnt(top); /* If top and msb counts don't match, this interrupted a write */ if (*cnt != rb_time_cnt(msb)) return false; ^ this check fails to catch that "bottom" is still not updated. So the old "bottom" value is returned, which is wrong. Fix this by checking that all three of msb, top, and bottom 2-bit cnt values match. The reason to favor checking all three fields over requiring a specific update order for both rb_time_set() and rb_time_cmpxchg() is because checking all three fields is more robust to handle partial failures of rb_time_cmpxchg() when interrupted by nested rb_time_set(). Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/ Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()") Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-13 03:30:49 +08:00
/* If top, msb or bottom counts don't match, this interrupted a write */
if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
return false;
/* The shift to msb will lose its cnt bits */
*ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
return true;
}
static bool rb_time_read(rb_time_t *t, u64 *ret)
{
unsigned long cnt;
return __rb_time_read(t, ret, &cnt);
}
static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
{
return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
}
static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
unsigned long *msb)
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
{
*top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
*bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
*msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
}
static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
{
val = rb_time_val_cnt(val, cnt);
local_set(t, val);
}
static void rb_time_set(rb_time_t *t, u64 val)
{
unsigned long cnt, top, bottom, msb;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
rb_time_split(val, &top, &bottom, &msb);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
/* Writes always succeed with a valid number even if it gets interrupted. */
do {
cnt = local_inc_return(&t->cnt);
rb_time_val_set(&t->top, top, cnt);
rb_time_val_set(&t->bottom, bottom, cnt);
rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
} while (cnt != local_read(&t->cnt));
}
static inline bool
rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
{
return local_try_cmpxchg(l, &expect, set);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
}
#else /* 64 bits */
/* local64_t always succeeds */
static inline bool rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
#endif
/*
* Enable this to make sure that the event passed to
* ring_buffer_event_time_stamp() is not committed and also
* is on the buffer that it passed in.
*/
//#define RB_VERIFY_EVENT
#ifdef RB_VERIFY_EVENT
static struct list_head *rb_list_head(struct list_head *list);
static void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
void *event)
{
struct buffer_page *page = cpu_buffer->commit_page;
struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page);
struct list_head *next;
long commit, write;
unsigned long addr = (unsigned long)event;
bool done = false;
int stop = 0;
/* Make sure the event exists and is not committed yet */
do {
if (page == tail_page || WARN_ON_ONCE(stop++ > 100))
done = true;
commit = local_read(&page->page->commit);
write = local_read(&page->write);
if (addr >= (unsigned long)&page->page->data[commit] &&
addr < (unsigned long)&page->page->data[write])
return;
next = rb_list_head(page->list.next);
page = list_entry(next, struct buffer_page, list);
} while (!done);
WARN_ON_ONCE(1);
}
#else
static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer,
void *event)
{
}
#endif
/*
* The absolute time stamp drops the 5 MSBs and some clocks may
* require them. The rb_fix_abs_ts() will take a previous full
* time stamp, and add the 5 MSB of that time stamp on to the
* saved absolute time stamp. Then they are compared in case of
* the unlikely event that the latest time stamp incremented
* the 5 MSB.
*/
static inline u64 rb_fix_abs_ts(u64 abs, u64 save_ts)
{
if (save_ts & TS_MSB) {
abs |= save_ts & TS_MSB;
/* Check for overflow */
if (unlikely(abs < save_ts))
abs += 1ULL << 59;
}
return abs;
}
static inline u64 rb_time_stamp(struct trace_buffer *buffer);
/**
* ring_buffer_event_time_stamp - return the event's current time stamp
* @buffer: The buffer that the event is on
* @event: the event to get the time stamp of
*
* Note, this must be called after @event is reserved, and before it is
* committed to the ring buffer. And must be called from the same
* context where the event was reserved (normal, softirq, irq, etc).
*
* Returns the time stamp associated with the current event.
* If the event has an extended time stamp, then that is used as
* the time stamp to return.
* In the highly unlikely case that the event was nested more than
* the max nesting, then the write_stamp of the buffer is returned,
* otherwise current time is returned, but that really neither of
* the last two cases should ever happen.
*/
u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()];
unsigned int nest;
u64 ts;
/* If the event includes an absolute time, then just use that */
if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
ts = rb_event_time_stamp(event);
return rb_fix_abs_ts(ts, cpu_buffer->tail_page->page->time_stamp);
}
nest = local_read(&cpu_buffer->committing);
verify_event(cpu_buffer, event);
if (WARN_ON_ONCE(!nest))
goto fail;
/* Read the current saved nesting level time stamp */
if (likely(--nest < MAX_NEST))
return cpu_buffer->event_stamp[nest];
/* Shouldn't happen, warn if it does */
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
/* Can only fail on 32 bit */
if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
/* Screw it, just read the current time */
ts = rb_time_stamp(cpu_buffer->buffer);
return ts;
}
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
* Returns the number of pages used by a per_cpu buffer of the ring buffer.
*/
size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
{
return buffer->buffers[cpu]->nr_pages;
}
/**
* ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
* @cpu: The cpu of the ring_buffer to get the number of pages from
*
* Returns the number of pages that have content in the ring buffer.
*/
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
{
size_t read;
size_t lost;
size_t cnt;
read = local_read(&buffer->buffers[cpu]->pages_read);
lost = local_read(&buffer->buffers[cpu]->pages_lost);
cnt = local_read(&buffer->buffers[cpu]->pages_touched);
if (WARN_ON_ONCE(cnt < lost))
return 0;
cnt -= lost;
/* The reader can read an empty page, but not more than that */
if (cnt < read) {
WARN_ON_ONCE(read > cnt + 1);
return 0;
}
return cnt - read;
}
static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
size_t nr_pages;
size_t dirty;
nr_pages = cpu_buffer->nr_pages;
if (!nr_pages || !full)
return true;
dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
return (dirty * 100) > (full * nr_pages);
}
/*
* rb_wake_up_waiters - wake up tasks waiting for ring buffer input
*
* Schedules a delayed work to wake up any task that is blocked on the
* ring buffer waiters queue.
*/
static void rb_wake_up_waiters(struct irq_work *work)
{
struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
wake_up_all(&rbwork->waiters);
if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
rbwork->wakeup_full = false;
rbwork->full_waiters_pending = false;
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
wake_up_all(&rbwork->full_waiters);
}
}
/**
* ring_buffer_wake_waiters - wake up any waiters on this ring buffer
* @buffer: The ring buffer to wake waiters on
* @cpu: The CPU buffer to wake waiters on
*
* In the case of a file that represents a ring buffer is closing,
* it is prudent to wake up any waiters that are on this.
*/
void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *rbwork;
ring-buffer: Check for NULL cpu_buffer in ring_buffer_wake_waiters() On some machines the number of listed CPUs may be bigger than the actual CPUs that exist. The tracing subsystem allocates a per_cpu directory with access to the per CPU ring buffer via a cpuX file. But to save space, the ring buffer will only allocate buffers for online CPUs, even though the CPU array will be as big as the nr_cpu_ids. With the addition of waking waiters on the ring buffer when closing the file, the ring_buffer_wake_waiters() now needs to make sure that the buffer is allocated (with the irq_work allocated with it) before trying to wake waiters, as it will cause a NULL pointer dereference. While debugging this, I added a NULL check for the buffer itself (which is OK to do), and also NULL pointer checks against buffer->buffers (which is not fine, and will WARN) as well as making sure the CPU number passed in is within the nr_cpu_ids (which is also not fine if it isn't). Link: https://lore.kernel.org/all/87h6zklb6n.wl-tiwai@suse.de/ Link: https://lore.kernel.org/all/CAM6Wdxc0KRJMXVAA0Y=u6Jh2V=uWB-_Fn6M4xRuNppfXzL1mUg@mail.gmail.com/ Link: https://lkml.kernel.org/linux-trace-kernel/20221101191009.1e7378c8@rorschach.local.home Cc: stable@vger.kernel.org Cc: Steven Noonan <steven.noonan@gmail.com> Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=1204705 Reported-by: Takashi Iwai <tiwai@suse.de> Reported-by: Roland Ruckerbauer <roland.rucky@gmail.com> Fixes: f3ddb74ad079 ("tracing: Wake up ring buffer waiters on closing of the file") Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-11-02 07:10:09 +08:00
if (!buffer)
return;
if (cpu == RING_BUFFER_ALL_CPUS) {
/* Wake up individual ones too. One level recursion */
for_each_buffer_cpu(buffer, cpu)
ring_buffer_wake_waiters(buffer, cpu);
rbwork = &buffer->irq_work;
} else {
ring-buffer: Check for NULL cpu_buffer in ring_buffer_wake_waiters() On some machines the number of listed CPUs may be bigger than the actual CPUs that exist. The tracing subsystem allocates a per_cpu directory with access to the per CPU ring buffer via a cpuX file. But to save space, the ring buffer will only allocate buffers for online CPUs, even though the CPU array will be as big as the nr_cpu_ids. With the addition of waking waiters on the ring buffer when closing the file, the ring_buffer_wake_waiters() now needs to make sure that the buffer is allocated (with the irq_work allocated with it) before trying to wake waiters, as it will cause a NULL pointer dereference. While debugging this, I added a NULL check for the buffer itself (which is OK to do), and also NULL pointer checks against buffer->buffers (which is not fine, and will WARN) as well as making sure the CPU number passed in is within the nr_cpu_ids (which is also not fine if it isn't). Link: https://lore.kernel.org/all/87h6zklb6n.wl-tiwai@suse.de/ Link: https://lore.kernel.org/all/CAM6Wdxc0KRJMXVAA0Y=u6Jh2V=uWB-_Fn6M4xRuNppfXzL1mUg@mail.gmail.com/ Link: https://lkml.kernel.org/linux-trace-kernel/20221101191009.1e7378c8@rorschach.local.home Cc: stable@vger.kernel.org Cc: Steven Noonan <steven.noonan@gmail.com> Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=1204705 Reported-by: Takashi Iwai <tiwai@suse.de> Reported-by: Roland Ruckerbauer <roland.rucky@gmail.com> Fixes: f3ddb74ad079 ("tracing: Wake up ring buffer waiters on closing of the file") Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-11-02 07:10:09 +08:00
if (WARN_ON_ONCE(!buffer->buffers))
return;
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
cpu_buffer = buffer->buffers[cpu];
ring-buffer: Check for NULL cpu_buffer in ring_buffer_wake_waiters() On some machines the number of listed CPUs may be bigger than the actual CPUs that exist. The tracing subsystem allocates a per_cpu directory with access to the per CPU ring buffer via a cpuX file. But to save space, the ring buffer will only allocate buffers for online CPUs, even though the CPU array will be as big as the nr_cpu_ids. With the addition of waking waiters on the ring buffer when closing the file, the ring_buffer_wake_waiters() now needs to make sure that the buffer is allocated (with the irq_work allocated with it) before trying to wake waiters, as it will cause a NULL pointer dereference. While debugging this, I added a NULL check for the buffer itself (which is OK to do), and also NULL pointer checks against buffer->buffers (which is not fine, and will WARN) as well as making sure the CPU number passed in is within the nr_cpu_ids (which is also not fine if it isn't). Link: https://lore.kernel.org/all/87h6zklb6n.wl-tiwai@suse.de/ Link: https://lore.kernel.org/all/CAM6Wdxc0KRJMXVAA0Y=u6Jh2V=uWB-_Fn6M4xRuNppfXzL1mUg@mail.gmail.com/ Link: https://lkml.kernel.org/linux-trace-kernel/20221101191009.1e7378c8@rorschach.local.home Cc: stable@vger.kernel.org Cc: Steven Noonan <steven.noonan@gmail.com> Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=1204705 Reported-by: Takashi Iwai <tiwai@suse.de> Reported-by: Roland Ruckerbauer <roland.rucky@gmail.com> Fixes: f3ddb74ad079 ("tracing: Wake up ring buffer waiters on closing of the file") Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-11-02 07:10:09 +08:00
/* The CPU buffer may not have been initialized yet */
if (!cpu_buffer)
return;
rbwork = &cpu_buffer->irq_work;
}
rbwork->wait_index++;
/* make sure the waiters see the new index */
smp_wmb();
rb_wake_up_waiters(&rbwork->work);
}
/**
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
* @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
{
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-04 04:09:38 +08:00
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
struct rb_irq_work *work;
long wait_index;
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
int ret = 0;
/*
* Depending on what the caller is waiting for, either any
* data in any cpu buffer, or a specific buffer, put the
* caller on the appropriate wait queue.
*/
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
/* Full only makes sense on per cpu reads */
full = 0;
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
wait_index = READ_ONCE(work->wait_index);
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
while (true) {
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
if (full)
prepare_to_wait(&work->full_waiters, &wait, TASK_INTERRUPTIBLE);
else
prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
/*
* The events can happen in critical sections where
* checking a work queue can cause deadlocks.
* After adding a task to the queue, this flag is set
* only to notify events to try to wake up the queue
* using irq_work.
*
* We don't clear it even if the buffer is no longer
* empty. The flag only causes the next event to run
* irq_work to do the work queue wake up. The worse
* that can happen if we race with !trace_empty() is that
* an event will cause an irq_work to try to wake up
* an empty queue.
*
* There's no reason to protect this flag either, as
* the work queue and irq_work logic will do the necessary
* synchronization for the wake ups. The only thing
* that is necessary is that the wake up happens after
* a task has been queued. It's OK for spurious wake ups.
*/
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
if (full)
work->full_waiters_pending = true;
else
work->waiters_pending = true;
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
if (signal_pending(current)) {
ret = -EINTR;
break;
}
if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
break;
if (cpu != RING_BUFFER_ALL_CPUS &&
!ring_buffer_empty_cpu(buffer, cpu)) {
unsigned long flags;
bool pagebusy;
bool done;
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
if (!full)
break;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
done = !pagebusy && full_hit(buffer, cpu, full);
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (done)
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
break;
}
schedule();
/* Make sure to see the new wait index */
smp_rmb();
if (wait_index != work->wait_index)
break;
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
}
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
if (full)
finish_wait(&work->full_waiters, &wait);
else
finish_wait(&work->waiters, &wait);
tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [<ffffffff8105b580>] ? __wake_up_common+0x90/0x90 [<ffffffff81092e25>] wait_on_pipe+0x35/0x40 [<ffffffff810936e3>] tracing_buffers_splice_read+0x2e3/0x3c0 [<ffffffff81093300>] ? tracing_stats_read+0x2a0/0x2a0 [<ffffffff812d10ab>] ? _raw_spin_unlock+0x2b/0x40 [<ffffffff810dc87b>] ? do_read_fault+0x21b/0x290 [<ffffffff810de56a>] ? handle_mm_fault+0x2ba/0xbd0 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff810951e2>] ? trace_buffer_lock_reserve+0x22/0x60 [<ffffffff81095c80>] ? trace_event_buffer_lock_reserve+0x40/0x80 [<ffffffff8112415d>] do_splice_to+0x6d/0x90 [<ffffffff81126971>] SyS_splice+0x7c1/0x800 [<ffffffff812d1edd>] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent <rabin@rab.in> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-11-11 02:46:34 +08:00
return ret;
}
/**
* ring_buffer_poll_wait - poll on buffer input
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
* @filp: the file descriptor
* @poll_table: The poll descriptor
* @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*
* Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
* zero otherwise.
*/
__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
struct file *filp, poll_table *poll_table, int full)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct rb_irq_work *work;
if (cpu == RING_BUFFER_ALL_CPUS) {
work = &buffer->irq_work;
full = 0;
} else {
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
if (full) {
poll_wait(filp, &work->full_waiters, poll_table);
work->full_waiters_pending = true;
ring-buffer: Update "shortest_full" in polling It was discovered that the ring buffer polling was incorrectly stating that read would not block, but that's because polling did not take into account that reads will block if the "buffer-percent" was set. Instead, the ring buffer polling would say reads would not block if there was any data in the ring buffer. This was incorrect behavior from a user space point of view. This was fixed by commit 42fb0a1e84ff by having the polling code check if the ring buffer had more data than what the user specified "buffer percent" had. The problem now is that the polling code did not register itself to the writer that it wanted to wait for a specific "full" value of the ring buffer. The result was that the writer would wake the polling waiter whenever there was a new event. The polling waiter would then wake up, see that there's not enough data in the ring buffer to notify user space and then go back to sleep. The next event would wake it up again. Before the polling fix was added, the code would wake up around 100 times for a hackbench 30 benchmark. After the "fix", due to the constant waking of the writer, it would wake up over 11,0000 times! It would never leave the kernel, so the user space behavior was still "correct", but this definitely is not the desired effect. To fix this, have the polling code add what it's waiting for to the "shortest_full" variable, to tell the writer not to wake it up if the buffer is not as full as it expects to be. Note, after this fix, it appears that the waiter is now woken up around 2x the times it was before (~200). This is a tremendous improvement from the 11,000 times, but I will need to spend some time to see why polling is more aggressive in its wakeups than the read blocking code. Link: https://lore.kernel.org/linux-trace-kernel/20230929180113.01c2cae3@rorschach.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Fixes: 42fb0a1e84ff ("tracing/ring-buffer: Have polling block on watermark") Reported-by: Julia Lawall <julia.lawall@inria.fr> Tested-by: Julia Lawall <julia.lawall@inria.fr> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-09-30 06:01:13 +08:00
if (!cpu_buffer->shortest_full ||
cpu_buffer->shortest_full > full)
cpu_buffer->shortest_full = full;
} else {
poll_wait(filp, &work->waiters, poll_table);
work->waiters_pending = true;
}
/*
* There's a tight race between setting the waiters_pending and
* checking if the ring buffer is empty. Once the waiters_pending bit
* is set, the next event will wake the task up, but we can get stuck
* if there's only a single event in.
*
* FIXME: Ideally, we need a memory barrier on the writer side as well,
* but adding a memory barrier to all events will cause too much of a
* performance hit in the fast path. We only need a memory barrier when
* the buffer goes from empty to having content. But as this race is
* extremely small, and it's not a problem if another event comes in, we
* will fix it later.
*/
smp_mb();
if (full)
return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
(cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
return EPOLLIN | EPOLLRDNORM;
return 0;
}
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
#define RB_WARN_ON(b, cond) \
({ \
int _____ret = unlikely(cond); \
if (_____ret) { \
if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
struct ring_buffer_per_cpu *__b = \
(void *)b; \
atomic_inc(&__b->buffer->record_disabled); \
} else \
atomic_inc(&b->record_disabled); \
WARN_ON(1); \
} \
_____ret; \
})
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
u64 ts;
/* Skip retpolines :-( */
if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
ts = trace_clock_local();
else
ts = buffer->clock();
/* shift to debug/test normalization and TIME_EXTENTS */
return ts << DEBUG_SHIFT;
}
u64 ring_buffer_time_stamp(struct trace_buffer *buffer)
{
u64 time;
preempt_disable_notrace();
time = rb_time_stamp(buffer);
preempt_enable_notrace();
return time;
}
EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
int cpu, u64 *ts)
{
/* Just stupid testing the normalize function and deltas */
*ts >>= DEBUG_SHIFT;
}
EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* Making the ring buffer lockless makes things tricky.
* Although writes only happen on the CPU that they are on,
* and they only need to worry about interrupts. Reads can
* happen on any CPU.
*
* The reader page is always off the ring buffer, but when the
* reader finishes with a page, it needs to swap its page with
* a new one from the buffer. The reader needs to take from
* the head (writes go to the tail). But if a writer is in overwrite
* mode and wraps, it must push the head page forward.
*
* Here lies the problem.
*
* The reader must be careful to replace only the head page, and
* not another one. As described at the top of the file in the
* ASCII art, the reader sets its old page to point to the next
* page after head. It then sets the page after head to point to
* the old reader page. But if the writer moves the head page
* during this operation, the reader could end up with the tail.
*
* We use cmpxchg to help prevent this race. We also do something
* special with the page before head. We set the LSB to 1.
*
* When the writer must push the page forward, it will clear the
* bit that points to the head page, move the head, and then set
* the bit that points to the new head page.
*
* We also don't want an interrupt coming in and moving the head
* page on another writer. Thus we use the second LSB to catch
* that too. Thus:
*
* head->list->prev->next bit 1 bit 0
* ------- -------
* Normal page 0 0
* Points to head page 0 1
* New head page 1 0
*
* Note we can not trust the prev pointer of the head page, because:
*
* +----+ +-----+ +-----+
* | |------>| T |---X--->| N |
* | |<------| | | |
* +----+ +-----+ +-----+
* ^ ^ |
* | +-----+ | |
* +----------| R |----------+ |
* | |<-----------+
* +-----+
*
* Key: ---X--> HEAD flag set in pointer
* T Tail page
* R Reader page
* N Next page
*
* (see __rb_reserve_next() to see where this happens)
*
* What the above shows is that the reader just swapped out
* the reader page with a page in the buffer, but before it
* could make the new header point back to the new page added
* it was preempted by a writer. The writer moved forward onto
* the new page added by the reader and is about to move forward
* again.
*
* You can see, it is legitimate for the previous pointer of
* the head (or any page) not to point back to itself. But only
* temporarily.
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
*/
#define RB_PAGE_NORMAL 0UL
#define RB_PAGE_HEAD 1UL
#define RB_PAGE_UPDATE 2UL
#define RB_FLAG_MASK 3UL
/* PAGE_MOVED is not part of the mask */
#define RB_PAGE_MOVED 4UL
/*
* rb_list_head - remove any bit
*/
static struct list_head *rb_list_head(struct list_head *list)
{
unsigned long val = (unsigned long)list;
return (struct list_head *)(val & ~RB_FLAG_MASK);
}
/*
* rb_is_head_page - test if the given page is the head page
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
*
* Because the reader may move the head_page pointer, we can
* not trust what the head page is (it may be pointing to
* the reader page). But if the next page is a header page,
* its flags will be non zero.
*/
static inline int
rb_is_head_page(struct buffer_page *page, struct list_head *list)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
{
unsigned long val;
val = (unsigned long)list->next;
if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
return RB_PAGE_MOVED;
return val & RB_FLAG_MASK;
}
/*
* rb_is_reader_page
*
* The unique thing about the reader page, is that, if the
* writer is ever on it, the previous pointer never points
* back to the reader page.
*/
static bool rb_is_reader_page(struct buffer_page *page)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
{
struct list_head *list = page->list.prev;
return rb_list_head(list->next) != &page->list;
}
/*
* rb_set_list_to_head - set a list_head to be pointing to head.
*/
static void rb_set_list_to_head(struct list_head *list)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
{
unsigned long *ptr;
ptr = (unsigned long *)&list->next;
*ptr |= RB_PAGE_HEAD;
*ptr &= ~RB_PAGE_UPDATE;
}
/*
* rb_head_page_activate - sets up head page
*/
static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *head;
head = cpu_buffer->head_page;
if (!head)
return;
/*
* Set the previous list pointer to have the HEAD flag.
*/
rb_set_list_to_head(head->list.prev);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
}
static void rb_list_head_clear(struct list_head *list)
{
unsigned long *ptr = (unsigned long *)&list->next;
*ptr &= ~RB_FLAG_MASK;
}
/*
* rb_head_page_deactivate - clears head page ptr (for free list)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
*/
static void
rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *hd;
/* Go through the whole list and clear any pointers found. */
rb_list_head_clear(cpu_buffer->pages);
list_for_each(hd, cpu_buffer->pages)
rb_list_head_clear(hd);
}
static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *head,
struct buffer_page *prev,
int old_flag, int new_flag)
{
struct list_head *list;
unsigned long val = (unsigned long)&head->list;
unsigned long ret;
list = &prev->list;
val &= ~RB_FLAG_MASK;
ret = cmpxchg((unsigned long *)&list->next,
val | old_flag, val | new_flag);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/* check if the reader took the page */
if ((ret & ~RB_FLAG_MASK) != val)
return RB_PAGE_MOVED;
return ret & RB_FLAG_MASK;
}
static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *head,
struct buffer_page *prev,
int old_flag)
{
return rb_head_page_set(cpu_buffer, head, prev,
old_flag, RB_PAGE_UPDATE);
}
static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *head,
struct buffer_page *prev,
int old_flag)
{
return rb_head_page_set(cpu_buffer, head, prev,
old_flag, RB_PAGE_HEAD);
}
static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *head,
struct buffer_page *prev,
int old_flag)
{
return rb_head_page_set(cpu_buffer, head, prev,
old_flag, RB_PAGE_NORMAL);
}
static inline void rb_inc_page(struct buffer_page **bpage)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
{
struct list_head *p = rb_list_head((*bpage)->list.next);
*bpage = list_entry(p, struct buffer_page, list);
}
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *head;
struct buffer_page *page;
struct list_head *list;
int i;
if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
return NULL;
/* sanity check */
list = cpu_buffer->pages;
if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
return NULL;
page = head = cpu_buffer->head_page;
/*
* It is possible that the writer moves the header behind
* where we started, and we miss in one loop.
* A second loop should grab the header, but we'll do
* three loops just because I'm paranoid.
*/
for (i = 0; i < 3; i++) {
do {
if (rb_is_head_page(page, page->list.prev)) {
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
cpu_buffer->head_page = page;
return page;
}
rb_inc_page(&page);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
} while (page != head);
}
RB_WARN_ON(cpu_buffer, 1);
return NULL;
}
static bool rb_head_page_replace(struct buffer_page *old,
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
struct buffer_page *new)
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
unsigned long val;
val = *ptr & ~RB_FLAG_MASK;
val |= RB_PAGE_HEAD;
return try_cmpxchg(ptr, &val, (unsigned long)&new->list);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
}
/*
* rb_tail_page_update - move the tail page forward
*/
static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
unsigned long old_entries;
unsigned long old_write;
/*
* The tail page now needs to be moved forward.
*
* We need to reset the tail page, but without messing
* with possible erasing of data brought in by interrupts
* that have moved the tail page and are currently on it.
*
* We add a counter to the write field to denote this.
*/
old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
local_inc(&cpu_buffer->pages_touched);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* Just make sure we have seen our old_write and synchronize
* with any interrupts that come in.
*/
barrier();
/*
* If the tail page is still the same as what we think
* it is, then it is up to us to update the tail
* pointer.
*/
if (tail_page == READ_ONCE(cpu_buffer->tail_page)) {
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/* Zero the write counter */
unsigned long val = old_write & ~RB_WRITE_MASK;
unsigned long eval = old_entries & ~RB_WRITE_MASK;
/*
* This will only succeed if an interrupt did
* not come in and change it. In which case, we
* do not want to modify it.
*
* We add (void) to let the compiler know that we do not care
* about the return value of these functions. We use the
* cmpxchg to only update if an interrupt did not already
* do it for us. If the cmpxchg fails, we don't care.
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
*/
(void)local_cmpxchg(&next_page->write, old_write, val);
(void)local_cmpxchg(&next_page->entries, old_entries, eval);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* No need to worry about races with clearing out the commit.
* it only can increment when a commit takes place. But that
* only happens in the outer most nested commit.
*/
local_set(&next_page->page->commit, 0);
/* Again, either we update tail_page or an interrupt does */
(void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
}
}
static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
struct buffer_page *bpage)
{
unsigned long val = (unsigned long)bpage;
RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* rb_check_pages - integrity check of buffer pages
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @cpu_buffer: CPU buffer with pages to test
*
* As a safety measure we check to make sure the data pages have not
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* been corrupted.
*/
static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
struct list_head *head = rb_list_head(cpu_buffer->pages);
struct list_head *tmp;
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->next)->prev) != head))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->prev)->next) != head))
return;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
return;
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
if (RB_WARN_ON(cpu_buffer,
ring-buffer: Handle race between rb_move_tail and rb_check_pages It seems a data race between ring_buffer writing and integrity check. That is, RB_FLAG of head_page is been updating, while at same time RB_FLAG was cleared when doing integrity check rb_check_pages(): rb_check_pages() rb_handle_head_page(): -------- -------- rb_head_page_deactivate() rb_head_page_set_normal() rb_head_page_activate() We do intergrity test of the list to check if the list is corrupted and it is still worth doing it. So, let's refactor rb_check_pages() such that we no longer clear and set flag during the list sanity checking. [1] and [2] are the test to reproduce and the crash report respectively. 1: ``` read_trace.sh while true; do # the "trace" file is closed after read head -1 /sys/kernel/tracing/trace > /dev/null done ``` ``` repro.sh sysctl -w kernel.panic_on_warn=1 # function tracer will writing enough data into ring_buffer echo function > /sys/kernel/tracing/current_tracer ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ./read_trace.sh & ``` 2: ------------[ cut here ]------------ WARNING: CPU: 9 PID: 62 at kernel/trace/ring_buffer.c:2653 rb_move_tail+0x450/0x470 Modules linked in: CPU: 9 PID: 62 Comm: ksoftirqd/9 Tainted: G W 6.2.0-rc6+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 RIP: 0010:rb_move_tail+0x450/0x470 Code: ff ff 4c 89 c8 f0 4d 0f b1 02 48 89 c2 48 83 e2 fc 49 39 d0 75 24 83 e0 03 83 f8 02 0f 84 e1 fb ff ff 48 8b 57 10 f0 ff 42 08 <0f> 0b 83 f8 02 0f 84 ce fb ff ff e9 db RSP: 0018:ffffb5564089bd00 EFLAGS: 00000203 RAX: 0000000000000000 RBX: ffff9db385a2bf81 RCX: ffffb5564089bd18 RDX: ffff9db281110100 RSI: 0000000000000fe4 RDI: ffff9db380145400 RBP: ffff9db385a2bf80 R08: ffff9db385a2bfc0 R09: ffff9db385a2bfc2 R10: ffff9db385a6c000 R11: ffff9db385a2bf80 R12: 0000000000000000 R13: 00000000000003e8 R14: ffff9db281110100 R15: ffffffffbb006108 FS: 0000000000000000(0000) GS:ffff9db3bdcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005602323024c8 CR3: 0000000022e0c000 CR4: 00000000000006e0 Call Trace: <TASK> ring_buffer_lock_reserve+0x136/0x360 ? __do_softirq+0x287/0x2df ? __pfx_rcu_softirq_qs+0x10/0x10 trace_function+0x21/0x110 ? __pfx_rcu_softirq_qs+0x10/0x10 ? __do_softirq+0x287/0x2df function_trace_call+0xf6/0x120 0xffffffffc038f097 ? rcu_softirq_qs+0x5/0x140 rcu_softirq_qs+0x5/0x140 __do_softirq+0x287/0x2df run_ksoftirqd+0x2a/0x30 smpboot_thread_fn+0x188/0x220 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0xe7/0x110 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 </TASK> ---[ end trace 0000000000000000 ]--- [ crash report and test reproducer credit goes to Zheng Yejian] Link: https://lore.kernel.org/linux-trace-kernel/1676376403-16462-1-git-send-email-quic_mojha@quicinc.com Cc: <mhiramat@kernel.org> Cc: stable@vger.kernel.org Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Reported-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-02-14 20:06:43 +08:00
rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
long i;
/*
* Check if the available memory is there first.
* Note, si_mem_available() only gives us a rough estimate of available
* memory. It may not be accurate. But we don't care, we just want
* to prevent doing any allocation when it is obvious that it is
* not going to succeed.
*/
ring-buffer: Check if memory is available before allocation The ring buffer is made up of a link list of pages. When making the ring buffer bigger, it will allocate all the pages it needs before adding to the ring buffer, and if it fails, it frees them and returns an error. This makes increasing the ring buffer size an all or nothing action. When this was first created, the pages were allocated with "NORETRY". This was to not cause any Out-Of-Memory (OOM) actions from allocating the ring buffer. But NORETRY was too strict, as the ring buffer would fail to expand even when there's memory available, but was taken up in the page cache. Commit 848618857d253 ("tracing/ring_buffer: Try harder to allocate") changed the allocating from NORETRY to RETRY_MAYFAIL. The RETRY_MAYFAIL would allocate from the page cache, but if there was no memory available, it would simple fail the allocation and not trigger an OOM. This worked fine, but had one problem. As the ring buffer would allocate one page at a time, it could take up all memory in the system before it failed to allocate and free that memory. If the allocation is happening and the ring buffer allocates all memory and then tries to take more than available, its allocation will not trigger an OOM, but if there's any allocation that happens someplace else, that could trigger an OOM, even though once the ring buffer's allocation fails, it would free up all the previous memory it tried to allocate, and allow other memory allocations to succeed. Commit d02bd27bd33dd ("mm/page_alloc.c: calculate 'available' memory in a separate function") separated out si_mem_availble() as a separate function that could be used to see how much memory is available in the system. Using this function to make sure that the ring buffer could be allocated before it tries to allocate pages we can avoid allocating all memory in the system and making it vulnerable to OOMs if other allocations are taking place. Link: http://lkml.kernel.org/r/1522320104-6573-1-git-send-email-zhaoyang.huang@spreadtrum.com CC: stable@vger.kernel.org Cc: linux-mm@kvack.org Fixes: 848618857d253 ("tracing/ring_buffer: Try harder to allocate") Requires: d02bd27bd33dd ("mm/page_alloc.c: calculate 'available' memory in a separate function") Reported-by: Zhaoyang Huang <huangzhaoyang@gmail.com> Tested-by: Joel Fernandes <joelaf@google.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-04-02 22:33:56 +08:00
i = si_mem_available();
if (i < nr_pages)
return -ENOMEM;
/*
* __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
* gracefully without invoking oom-killer and the system is not
* destabilized.
*/
mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
/*
* If a user thread allocates too much, and si_mem_available()
* reports there's enough memory, even though there is not.
* Make sure the OOM killer kills this thread. This can happen
* even with RETRY_MAYFAIL because another task may be doing
* an allocation after this task has taken all memory.
* This is the task the OOM killer needs to take out during this
* loop, even if it was triggered by an allocation somewhere else.
*/
if (user_thread)
set_current_oom_origin();
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
for (i = 0; i < nr_pages; i++) {
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
struct page *page;
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
mflags, cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_check_bpage(cpu_buffer, bpage);
list_add(&bpage->list, pages);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
if (!page)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
goto free_pages;
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
bpage->page = page_address(page);
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
goto free_pages;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
if (user_thread)
clear_current_oom_origin();
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return 0;
free_pages:
list_for_each_entry_safe(bpage, tmp, pages, list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
if (user_thread)
clear_current_oom_origin();
return -ENOMEM;
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
unsigned long nr_pages)
{
LIST_HEAD(pages);
WARN_ON(!nr_pages);
if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages))
return -ENOMEM;
/*
* The ring buffer page list is a circular list that does not
* start and end with a list head. All page list items point to
* other pages.
*/
cpu_buffer->pages = pages.next;
list_del(&pages);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer->nr_pages = nr_pages;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_check_pages(cpu_buffer);
return 0;
}
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
struct page *page;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
int ret;
cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!cpu_buffer)
return NULL;
cpu_buffer->cpu = cpu;
cpu_buffer->buffer = buffer;
raw_spin_lock_init(&cpu_buffer->reader_lock);
ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 00:18:39 +08:00
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
init_completion(&cpu_buffer->update_done);
init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&cpu_buffer->irq_work.waiters);
ring-buffer: Do not wake up a splice waiter when page is not full When an application connects to the ring buffer via splice, it can only read full pages. Splice does not work with partial pages. If there is not enough data to fill a page, the splice command will either block or return -EAGAIN (if set to nonblock). Code was added where if the page is not full, to just sleep again. The problem is, it will get woken up again on the next event. That is, when something is written into the ring buffer, if there is a waiter it will wake it up. The waiter would then check the buffer, see that it still does not have enough data to fill a page and go back to sleep. To make matters worse, when the waiter goes back to sleep, it could cause another event, which would wake it back up again to see it doesn't have enough data and sleep again. This produces a tremendous overhead and fills the ring buffer with noise. For example, recording sched_switch on an idle system for 10 seconds produces 25,350,475 events!!! Create another wait queue for those waiters wanting full pages. When an event is written, it only wakes up waiters if there's a full page of data. It does not wake up the waiter if the page is not yet full. After this change, recording sched_switch on an idle system for 10 seconds produces only 800 events. Getting rid of 25,349,675 useless events (99.9969% of events!!), is something to take seriously. Cc: stable@vger.kernel.org # 3.16+ Cc: Rabin Vincent <rabin@rab.in> Fixes: e30f53aad220 "tracing: Do not busy wait in buffer splice" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-02-11 11:14:53 +08:00
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
if (!bpage)
goto fail_free_buffer;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
if (!page)
goto fail_free_reader;
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
bpage->page = page_address(page);
rb_init_page(bpage->page);
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
ret = rb_allocate_pages(cpu_buffer, nr_pages);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (ret < 0)
goto fail_free_reader;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_head_page_activate(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return cpu_buffer;
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
fail_free_buffer:
kfree(cpu_buffer);
return NULL;
}
static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Sync IRQ works before buffer destruction If something was written to the buffer just before destruction, it may be possible (maybe not in a real system, but it did happen in ARCH=um with time-travel) to destroy the ringbuffer before the IRQ work ran, leading this KASAN report (or a crash without KASAN): BUG: KASAN: slab-use-after-free in irq_work_run_list+0x11a/0x13a Read of size 8 at addr 000000006d640a48 by task swapper/0 CPU: 0 PID: 0 Comm: swapper Tainted: G W O 6.3.0-rc1 #7 Stack: 60c4f20f 0c203d48 41b58ab3 60f224fc 600477fa 60f35687 60c4f20f 601273dd 00000008 6101eb00 6101eab0 615be548 Call Trace: [<60047a58>] show_stack+0x25e/0x282 [<60c609e0>] dump_stack_lvl+0x96/0xfd [<60c50d4c>] print_report+0x1a7/0x5a8 [<603078d3>] kasan_report+0xc1/0xe9 [<60308950>] __asan_report_load8_noabort+0x1b/0x1d [<60232844>] irq_work_run_list+0x11a/0x13a [<602328b4>] irq_work_tick+0x24/0x34 [<6017f9dc>] update_process_times+0x162/0x196 [<6019f335>] tick_sched_handle+0x1a4/0x1c3 [<6019fd9e>] tick_sched_timer+0x79/0x10c [<601812b9>] __hrtimer_run_queues.constprop.0+0x425/0x695 [<60182913>] hrtimer_interrupt+0x16c/0x2c4 [<600486a3>] um_timer+0x164/0x183 [...] Allocated by task 411: save_stack_trace+0x99/0xb5 stack_trace_save+0x81/0x9b kasan_save_stack+0x2d/0x54 kasan_set_track+0x34/0x3e kasan_save_alloc_info+0x25/0x28 ____kasan_kmalloc+0x8b/0x97 __kasan_kmalloc+0x10/0x12 __kmalloc+0xb2/0xe8 load_elf_phdrs+0xee/0x182 [...] The buggy address belongs to the object at 000000006d640800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 584 bytes inside of freed 1024-byte region [000000006d640800, 000000006d640c00) Add the appropriate irq_work_sync() so the work finishes before the buffers are destroyed. Prior to the commit in the Fixes tag below, there was only a single global IRQ work, so this issue didn't exist. Link: https://lore.kernel.org/linux-trace-kernel/20230427175920.a76159263122.I8295e405c44362a86c995e9c2c37e3e03810aa56@changeid Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 15693458c4bc ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-04-27 23:59:20 +08:00
irq_work_sync(&cpu_buffer->irq_work.work);
free_buffer_page(cpu_buffer->reader_page);
if (head) {
rb_head_page_deactivate(cpu_buffer);
list_for_each_entry_safe(bpage, tmp, head, list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
bpage = list_entry(head, struct buffer_page, list);
free_buffer_page(bpage);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
free_page((unsigned long)cpu_buffer->free_page);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
kfree(cpu_buffer);
}
/**
* __ring_buffer_alloc - allocate a new ring_buffer
* @size: the size in bytes per cpu that is needed.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @flags: attributes to set for the ring buffer.
* @key: ring buffer reader_lock_key.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
* flag. This flag means that the buffer will overwrite old data
* when the buffer wraps. If this flag is not set, the buffer will
* drop data when the tail hits the head.
*/
struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 00:18:39 +08:00
struct lock_class_key *key)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct trace_buffer *buffer;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
long nr_pages;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
int bsize;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
int cpu;
int ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
GFP_KERNEL);
if (!buffer)
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
buffer->flags = flags;
buffer->clock = trace_clock_local;
ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 00:18:39 +08:00
buffer->reader_lock_key = key;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
init_waitqueue_head(&buffer->irq_work.waiters);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* need at least two pages */
if (nr_pages < 2)
nr_pages = 2;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
buffer->cpus = nr_cpu_ids;
bsize = sizeof(void *) * nr_cpu_ids;
buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
GFP_KERNEL);
if (!buffer->buffers)
goto fail_free_cpumask;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
goto fail_free_buffers;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
if (ret < 0)
goto fail_free_buffers;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
mutex_init(&buffer->mutex);
return buffer;
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
if (buffer->buffers[cpu])
rb_free_cpu_buffer(buffer->buffers[cpu]);
}
kfree(buffer->buffers);
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
fail_free_buffer:
kfree(buffer);
return NULL;
}
ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-09 00:18:39 +08:00
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_free - free a ring buffer.
* @buffer: the buffer to free.
*/
void
ring_buffer_free(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
int cpu;
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
ring-buffer: Sync IRQ works before buffer destruction If something was written to the buffer just before destruction, it may be possible (maybe not in a real system, but it did happen in ARCH=um with time-travel) to destroy the ringbuffer before the IRQ work ran, leading this KASAN report (or a crash without KASAN): BUG: KASAN: slab-use-after-free in irq_work_run_list+0x11a/0x13a Read of size 8 at addr 000000006d640a48 by task swapper/0 CPU: 0 PID: 0 Comm: swapper Tainted: G W O 6.3.0-rc1 #7 Stack: 60c4f20f 0c203d48 41b58ab3 60f224fc 600477fa 60f35687 60c4f20f 601273dd 00000008 6101eb00 6101eab0 615be548 Call Trace: [<60047a58>] show_stack+0x25e/0x282 [<60c609e0>] dump_stack_lvl+0x96/0xfd [<60c50d4c>] print_report+0x1a7/0x5a8 [<603078d3>] kasan_report+0xc1/0xe9 [<60308950>] __asan_report_load8_noabort+0x1b/0x1d [<60232844>] irq_work_run_list+0x11a/0x13a [<602328b4>] irq_work_tick+0x24/0x34 [<6017f9dc>] update_process_times+0x162/0x196 [<6019f335>] tick_sched_handle+0x1a4/0x1c3 [<6019fd9e>] tick_sched_timer+0x79/0x10c [<601812b9>] __hrtimer_run_queues.constprop.0+0x425/0x695 [<60182913>] hrtimer_interrupt+0x16c/0x2c4 [<600486a3>] um_timer+0x164/0x183 [...] Allocated by task 411: save_stack_trace+0x99/0xb5 stack_trace_save+0x81/0x9b kasan_save_stack+0x2d/0x54 kasan_set_track+0x34/0x3e kasan_save_alloc_info+0x25/0x28 ____kasan_kmalloc+0x8b/0x97 __kasan_kmalloc+0x10/0x12 __kmalloc+0xb2/0xe8 load_elf_phdrs+0xee/0x182 [...] The buggy address belongs to the object at 000000006d640800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 584 bytes inside of freed 1024-byte region [000000006d640800, 000000006d640c00) Add the appropriate irq_work_sync() so the work finishes before the buffers are destroyed. Prior to the commit in the Fixes tag below, there was only a single global IRQ work, so this issue didn't exist. Link: https://lore.kernel.org/linux-trace-kernel/20230427175920.a76159263122.I8295e405c44362a86c995e9c2c37e3e03810aa56@changeid Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Fixes: 15693458c4bc ("tracing/ring-buffer: Move poll wake ups into ring buffer code") Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-04-27 23:59:20 +08:00
irq_work_sync(&buffer->irq_work.work);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
kfree(buffer->buffers);
free_cpumask_var(buffer->cpumask);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
kfree(buffer);
}
EXPORT_SYMBOL_GPL(ring_buffer_free);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
void ring_buffer_set_clock(struct trace_buffer *buffer,
u64 (*clock)(void))
{
buffer->clock = clock;
}
void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs)
{
buffer->time_stamp_abs = abs;
}
bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
{
return buffer->time_stamp_abs;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
static inline unsigned long rb_page_entries(struct buffer_page *bpage)
{
return local_read(&bpage->entries) & RB_WRITE_MASK;
}
static inline unsigned long rb_page_write(struct buffer_page *bpage)
{
return local_read(&bpage->write) & RB_WRITE_MASK;
}
static bool
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct list_head *tail_page, *to_remove, *next_page;
struct buffer_page *to_remove_page, *tmp_iter_page;
struct buffer_page *last_page, *first_page;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
unsigned long nr_removed;
unsigned long head_bit;
int page_entries;
head_bit = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
raw_spin_lock_irq(&cpu_buffer->reader_lock);
atomic_inc(&cpu_buffer->record_disabled);
/*
* We don't race with the readers since we have acquired the reader
* lock. We also don't race with writers after disabling recording.
* This makes it easy to figure out the first and the last page to be
* removed from the list. We unlink all the pages in between including
* the first and last pages. This is done in a busy loop so that we
* lose the least number of traces.
* The pages are freed after we restart recording and unlock readers.
*/
tail_page = &cpu_buffer->tail_page->list;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* tail page might be on reader page, we remove the next page
* from the ring buffer
*/
if (cpu_buffer->tail_page == cpu_buffer->reader_page)
tail_page = rb_list_head(tail_page->next);
to_remove = tail_page;
/* start of pages to remove */
first_page = list_entry(rb_list_head(to_remove->next),
struct buffer_page, list);
for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) {
to_remove = rb_list_head(to_remove)->next;
head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/* Read iterators need to reset themselves when some pages removed */
cpu_buffer->pages_removed += nr_removed;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
next_page = rb_list_head(to_remove)->next;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Now we remove all pages between tail_page and next_page.
* Make sure that we have head_bit value preserved for the
* next page
*/
tail_page->next = (struct list_head *)((unsigned long)next_page |
head_bit);
next_page = rb_list_head(next_page);
next_page->prev = tail_page;
/* make sure pages points to a valid page in the ring buffer */
cpu_buffer->pages = next_page;
/* update head page */
if (head_bit)
cpu_buffer->head_page = list_entry(next_page,
struct buffer_page, list);
/* pages are removed, resume tracing and then free the pages */
atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages));
/* last buffer page to remove */
last_page = list_entry(rb_list_head(to_remove), struct buffer_page,
list);
tmp_iter_page = first_page;
do {
cond_resched();
to_remove_page = tmp_iter_page;
rb_inc_page(&tmp_iter_page);
/* update the counters */
page_entries = rb_page_entries(to_remove_page);
if (page_entries) {
/*
* If something was added to this page, it was full
* since it is not the tail page. So we deduct the
* bytes consumed in ring buffer from here.
* Increment overrun to account for the lost events.
*/
local_add(page_entries, &cpu_buffer->overrun);
local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
}
/*
* We have already removed references to this list item, just
* free up the buffer_page and its page
*/
free_buffer_page(to_remove_page);
nr_removed--;
} while (to_remove_page != last_page);
RB_WARN_ON(cpu_buffer, nr_removed);
return nr_removed == 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
static bool
rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct list_head *pages = &cpu_buffer->new_pages;
2022-12-09 23:11:51 +08:00
unsigned long flags;
bool success;
int retries;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
2022-12-09 23:11:51 +08:00
/* Can be called at early boot up, where interrupts must not been enabled */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* We are holding the reader lock, so the reader page won't be swapped
* in the ring buffer. Now we are racing with the writer trying to
* move head page and the tail page.
* We are going to adapt the reader page update process where:
* 1. We first splice the start and end of list of new pages between
* the head page and its previous page.
* 2. We cmpxchg the prev_page->next to point from head page to the
* start of new pages list.
* 3. Finally, we update the head->prev to the end of new list.
*
* We will try this process 10 times, to make sure that we don't keep
* spinning.
*/
retries = 10;
success = false;
while (retries--) {
struct list_head *head_page, *prev_page;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
if (!hpage)
break;
head_page = &hpage->list;
prev_page = head_page->prev;
first_page = pages->next;
last_page = pages->prev;
head_page_with_bit = (struct list_head *)
((unsigned long)head_page | RB_PAGE_HEAD);
last_page->next = head_page_with_bit;
first_page->prev = prev_page;
/* caution: head_page_with_bit gets updated on cmpxchg failure */
if (try_cmpxchg(&prev_page->next,
&head_page_with_bit, first_page)) {
/*
* yay, we replaced the page pointer to our new list,
* now, we just have to update to head page's prev
* pointer to point to end of list
*/
head_page->prev = last_page;
success = true;
break;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
if (success)
INIT_LIST_HEAD(pages);
/*
* If we weren't successful in adding in new pages, warn and stop
* tracing
*/
RB_WARN_ON(cpu_buffer, !success);
2022-12-09 23:11:51 +08:00
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
/* free pages if they weren't inserted */
if (!success) {
struct buffer_page *bpage, *tmp;
list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
}
return success;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
bool success;
if (cpu_buffer->nr_pages_to_update > 0)
success = rb_insert_pages(cpu_buffer);
else
success = rb_remove_pages(cpu_buffer,
-cpu_buffer->nr_pages_to_update);
if (success)
cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update;
}
static void update_pages_handler(struct work_struct *work)
{
struct ring_buffer_per_cpu *cpu_buffer = container_of(work,
struct ring_buffer_per_cpu, update_pages_work);
rb_update_pages(cpu_buffer);
complete(&cpu_buffer->update_done);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_resize - resize the ring buffer
* @buffer: the buffer to resize.
* @size: the new size.
* @cpu_id: the cpu buffer to resize
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
* Minimum size is 2 * BUF_PAGE_SIZE.
*
* Returns 0 on success and < 0 on failure.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
int cpu_id)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
unsigned long nr_pages;
int cpu, err;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Always succeed at resizing a non-existent buffer:
*/
if (!buffer)
return 0;
ring-buffer: Check for valid buffer before changing size On some machines the number of possible CPUS is not the same as the number of CPUs that is on the machine. Ftrace uses possible_cpus to update the tracing structures but the ring buffer only allocates per cpu buffers for online CPUs when they come up. When the wakeup tracer was enabled in such a case, the ftrace code enabled all possible cpu buffers, but the code in ring_buffer_resize() did not check to see if the buffer in question was allocated. Since boot up CPUs did not match possible CPUs it caused the following crash: BUG: unable to handle kernel NULL pointer dereference at 00000020 IP: [<c1097851>] ring_buffer_resize+0x16a/0x28d *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: [last unloaded: scsi_wait_scan] Pid: 1387, comm: bash Not tainted 3.4.0-test+ #13 /DG965MQ EIP: 0060:[<c1097851>] EFLAGS: 00010217 CPU: 0 EIP is at ring_buffer_resize+0x16a/0x28d EAX: f5a14340 EBX: f6026b80 ECX: 00000ff4 EDX: 00000ff3 ESI: 00000000 EDI: 00000002 EBP: f4275ecc ESP: f4275eb0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 CR0: 80050033 CR2: 00000020 CR3: 34396000 CR4: 000007d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process bash (pid: 1387, ti=f4274000 task=f4380cb0 task.ti=f4274000) Stack: c109cf9a f6026b98 00000162 00160f68 00000006 00160f68 00000002 f4275ef0 c109d013 f4275ee8 c123b72a c1c0bf00 c1cc81dc 00000005 f4275f98 00000007 f4275f70 c109d0c7 7700000e 75656b61 00000070 f5e90900 f5c4e198 00000301 Call Trace: [<c109cf9a>] ? tracing_set_tracer+0x115/0x1e9 [<c109d013>] tracing_set_tracer+0x18e/0x1e9 [<c123b72a>] ? _copy_from_user+0x30/0x46 [<c109d0c7>] tracing_set_trace_write+0x59/0x7f [<c10ec01e>] ? fput+0x18/0x1c6 [<c11f8732>] ? security_file_permission+0x27/0x2b [<c10eaacd>] ? rw_verify_area+0xcf/0xf2 [<c10ec01e>] ? fput+0x18/0x1c6 [<c109d06e>] ? tracing_set_tracer+0x1e9/0x1e9 [<c10ead77>] vfs_write+0x8b/0xe3 [<c10ebead>] ? fget_light+0x30/0x81 [<c10eaf54>] sys_write+0x42/0x63 [<c1834fbf>] sysenter_do_call+0x12/0x28 This happens with the latency tracer as the ftrace code updates the saved max buffer via its cpumask and not with a global setting. Adding a check in ring_buffer_resize() to make sure the buffer being resized exists, fixes the problem. Cc: Vaibhav Nagarnaik <vnagarnaik@google.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2012-05-24 03:35:17 +08:00
/* Make sure the requested buffer exists */
if (cpu_id != RING_BUFFER_ALL_CPUS &&
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
ring-buffer: Check for valid buffer before changing size On some machines the number of possible CPUS is not the same as the number of CPUs that is on the machine. Ftrace uses possible_cpus to update the tracing structures but the ring buffer only allocates per cpu buffers for online CPUs when they come up. When the wakeup tracer was enabled in such a case, the ftrace code enabled all possible cpu buffers, but the code in ring_buffer_resize() did not check to see if the buffer in question was allocated. Since boot up CPUs did not match possible CPUs it caused the following crash: BUG: unable to handle kernel NULL pointer dereference at 00000020 IP: [<c1097851>] ring_buffer_resize+0x16a/0x28d *pde = 00000000 Oops: 0000 [#1] PREEMPT SMP Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: [last unloaded: scsi_wait_scan] Pid: 1387, comm: bash Not tainted 3.4.0-test+ #13 /DG965MQ EIP: 0060:[<c1097851>] EFLAGS: 00010217 CPU: 0 EIP is at ring_buffer_resize+0x16a/0x28d EAX: f5a14340 EBX: f6026b80 ECX: 00000ff4 EDX: 00000ff3 ESI: 00000000 EDI: 00000002 EBP: f4275ecc ESP: f4275eb0 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 CR0: 80050033 CR2: 00000020 CR3: 34396000 CR4: 000007d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process bash (pid: 1387, ti=f4274000 task=f4380cb0 task.ti=f4274000) Stack: c109cf9a f6026b98 00000162 00160f68 00000006 00160f68 00000002 f4275ef0 c109d013 f4275ee8 c123b72a c1c0bf00 c1cc81dc 00000005 f4275f98 00000007 f4275f70 c109d0c7 7700000e 75656b61 00000070 f5e90900 f5c4e198 00000301 Call Trace: [<c109cf9a>] ? tracing_set_tracer+0x115/0x1e9 [<c109d013>] tracing_set_tracer+0x18e/0x1e9 [<c123b72a>] ? _copy_from_user+0x30/0x46 [<c109d0c7>] tracing_set_trace_write+0x59/0x7f [<c10ec01e>] ? fput+0x18/0x1c6 [<c11f8732>] ? security_file_permission+0x27/0x2b [<c10eaacd>] ? rw_verify_area+0xcf/0xf2 [<c10ec01e>] ? fput+0x18/0x1c6 [<c109d06e>] ? tracing_set_tracer+0x1e9/0x1e9 [<c10ead77>] vfs_write+0x8b/0xe3 [<c10ebead>] ? fget_light+0x30/0x81 [<c10eaf54>] sys_write+0x42/0x63 [<c1834fbf>] sysenter_do_call+0x12/0x28 This happens with the latency tracer as the ftrace code updates the saved max buffer via its cpumask and not with a global setting. Adding a check in ring_buffer_resize() to make sure the buffer being resized exists, fixes the problem. Cc: Vaibhav Nagarnaik <vnagarnaik@google.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2012-05-24 03:35:17 +08:00
ring-buffer: Prevent overflow of size in ring_buffer_resize() If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE then the DIV_ROUND_UP() will return zero. Here's the details: # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb tracing_entries_write() processes this and converts kb to bytes. 18014398509481980 << 10 = 18446744073709547520 and this is passed to ring_buffer_resize() as unsigned long size. size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); Where DIV_ROUND_UP(a, b) is (a + b - 1)/b BUF_PAGE_SIZE is 4080 and here 18446744073709547520 + 4080 - 1 = 18446744073709551599 where 18446744073709551599 is still smaller than 2^64 2^64 - 18446744073709551599 = 17 But now 18446744073709551599 / 4080 = 4521260802379792 and size = size * 4080 = 18446744073709551360 This is checked to make sure its still greater than 2 * 4080, which it is. Then we convert to the number of buffer pages needed. nr_page = DIV_ROUND_UP(size, BUF_PAGE_SIZE) but this time size is 18446744073709551360 and 2^64 - (18446744073709551360 + 4080 - 1) = -3823 Thus it overflows and the resulting number is less than 4080, which makes 3823 / 4080 = 0 an nr_pages is set to this. As we already checked against the minimum that nr_pages may be, this causes the logic to fail as well, and we crash the kernel. There's no reason to have the two DIV_ROUND_UP() (that's just result of historical code changes), clean up the code and fix this bug. Cc: stable@vger.kernel.org # 3.5+ Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-13 21:34:12 +08:00
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* we need a minimum of two pages */
ring-buffer: Prevent overflow of size in ring_buffer_resize() If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE then the DIV_ROUND_UP() will return zero. Here's the details: # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb tracing_entries_write() processes this and converts kb to bytes. 18014398509481980 << 10 = 18446744073709547520 and this is passed to ring_buffer_resize() as unsigned long size. size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); Where DIV_ROUND_UP(a, b) is (a + b - 1)/b BUF_PAGE_SIZE is 4080 and here 18446744073709547520 + 4080 - 1 = 18446744073709551599 where 18446744073709551599 is still smaller than 2^64 2^64 - 18446744073709551599 = 17 But now 18446744073709551599 / 4080 = 4521260802379792 and size = size * 4080 = 18446744073709551360 This is checked to make sure its still greater than 2 * 4080, which it is. Then we convert to the number of buffer pages needed. nr_page = DIV_ROUND_UP(size, BUF_PAGE_SIZE) but this time size is 18446744073709551360 and 2^64 - (18446744073709551360 + 4080 - 1) = -3823 Thus it overflows and the resulting number is less than 4080, which makes 3823 / 4080 = 0 an nr_pages is set to this. As we already checked against the minimum that nr_pages may be, this causes the logic to fail as well, and we crash the kernel. There's no reason to have the two DIV_ROUND_UP() (that's just result of historical code changes), clean up the code and fix this bug. Cc: stable@vger.kernel.org # 3.5+ Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic") Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-13 21:34:12 +08:00
if (nr_pages < 2)
nr_pages = 2;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* prevent another thread from changing buffer sizes */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
mutex_lock(&buffer->mutex);
ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin <chen.lin5@zte.com.cn> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-19 15:58:47 +08:00
atomic_inc(&buffer->resizing);
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
* Don't succeed if resizing is disabled, as a reader might be
* manipulating the ring buffer and is expecting a sane state while
* this is true.
*/
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
if (atomic_read(&cpu_buffer->resize_disabled)) {
err = -EBUSY;
goto out_err_unlock;
}
}
/* calculate the pages to update */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
cpu_buffer->nr_pages_to_update = nr_pages -
cpu_buffer->nr_pages;
/*
* nothing more to do for removing pages or no update
*/
if (cpu_buffer->nr_pages_to_update <= 0)
continue;
/*
* to add pages, make sure all new pages can be
* allocated without receiving ENOMEM
*/
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
&cpu_buffer->new_pages)) {
/* not enough memory for new pages */
err = -ENOMEM;
goto out_err;
}
cond_resched();
}
cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
* since we can change their buffer sizes without any race.
*/
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
if (!cpu_buffer->nr_pages_to_update)
continue;
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
2022-12-09 23:11:51 +08:00
/* Run directly if possible. */
migrate_disable();
if (cpu != smp_processor_id()) {
migrate_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
} else {
update_pages_handler(&cpu_buffer->update_pages_work);
migrate_enable();
}
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/* wait for all the updates to complete */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
if (!cpu_buffer->nr_pages_to_update)
continue;
if (cpu_online(cpu))
wait_for_completion(&cpu_buffer->update_done);
cpu_buffer->nr_pages_to_update = 0;
}
cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
if (nr_pages == cpu_buffer->nr_pages)
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Don't succeed if resizing is disabled, as a reader might be
* manipulating the ring buffer and is expecting a sane state while
* this is true.
*/
if (atomic_read(&cpu_buffer->resize_disabled)) {
err = -EBUSY;
goto out_err_unlock;
}
cpu_buffer->nr_pages_to_update = nr_pages -
cpu_buffer->nr_pages;
INIT_LIST_HEAD(&cpu_buffer->new_pages);
if (cpu_buffer->nr_pages_to_update > 0 &&
__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update,
&cpu_buffer->new_pages)) {
err = -ENOMEM;
goto out_err;
}
cpus_read_lock();
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
2022-12-09 23:11:51 +08:00
/* Run directly if possible. */
migrate_disable();
if (cpu_id == smp_processor_id()) {
rb_update_pages(cpu_buffer);
migrate_enable();
} else {
migrate_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
}
}
cpu_buffer->nr_pages_to_update = 0;
cpus_read_unlock();
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
out:
/*
* The ring buffer resize can happen with the ring buffer
* enabled, so that the update disturbs the tracing as little
* as possible. But if the buffer is disabled, we do not need
* to worry about that, and we can take the time to verify
* that the buffer is not corrupt.
*/
if (atomic_read(&buffer->record_disabled)) {
atomic_inc(&buffer->record_disabled);
/*
* Even though the buffer was disabled, we must make sure
* that it is truly disabled before calling rb_check_pages.
* There could have been a race between checking
* record_disable and incrementing it.
*/
synchronize_rcu();
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
rb_check_pages(cpu_buffer);
}
atomic_dec(&buffer->record_disabled);
}
ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin <chen.lin5@zte.com.cn> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-19 15:58:47 +08:00
atomic_dec(&buffer->resizing);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
mutex_unlock(&buffer->mutex);
return 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
out_err:
for_each_buffer_cpu(buffer, cpu) {
struct buffer_page *bpage, *tmp;
cpu_buffer = buffer->buffers[cpu];
cpu_buffer->nr_pages_to_update = 0;
if (list_empty(&cpu_buffer->new_pages))
continue;
list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages,
list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
out_err_unlock:
ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin <chen.lin5@zte.com.cn> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-19 15:58:47 +08:00
atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return err;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val)
{
mutex_lock(&buffer->mutex);
if (val)
buffer->flags |= RB_FL_OVERWRITE;
else
buffer->flags &= ~RB_FL_OVERWRITE;
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
return bpage->page->data + index;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
static __always_inline struct ring_buffer_event *
rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
return __rb_page_index(cpu_buffer->reader_page,
cpu_buffer->reader_page->read);
}
static struct ring_buffer_event *
rb_iter_head_event(struct ring_buffer_iter *iter)
{
struct ring_buffer_event *event;
struct buffer_page *iter_head_page = iter->head_page;
unsigned long commit;
unsigned length;
if (iter->head != iter->next_event)
return iter->event;
/*
* When the writer goes across pages, it issues a cmpxchg which
* is a mb(), which will synchronize with the rmb here.
* (see rb_tail_page_update() and __rb_reserve_next())
*/
commit = rb_page_commit(iter_head_page);
smp_rmb();
/* An event needs to be at least 8 bytes in size */
if (iter->head > commit - 8)
goto reset;
event = __rb_page_index(iter_head_page, iter->head);
length = rb_event_length(event);
/*
* READ_ONCE() doesn't work on functions and we don't want the
* compiler doing any crazy optimizations with length.
*/
barrier();
if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
/* Writer corrupted the read? */
goto reset;
memcpy(iter->event, event, length);
/*
* If the page stamp is still the same after this rmb() then the
* event was safely copied without the writer entering the page.
*/
smp_rmb();
/* Make sure the page didn't change since we read this */
if (iter->page_stamp != iter_head_page->page->time_stamp ||
commit > rb_page_commit(iter_head_page))
goto reset;
iter->next_event = iter->head + length;
return iter->event;
reset:
/* Reset to the beginning */
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
iter->head = 0;
iter->next_event = 0;
iter->missed_events = 1;
return NULL;
}
/* Size is determined by what has been committed */
static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
{
return rb_page_commit(bpage);
}
static __always_inline unsigned
rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
{
return rb_page_commit(cpu_buffer->commit_page);
}
static __always_inline unsigned
rb_event_index(struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/*
* The iterator could be on the reader page (it starts there).
* But the head could have moved, since the reader was
* found. Check for this case and assign the iterator
* to the head page instead of next.
*/
if (iter->head_page == cpu_buffer->reader_page)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
iter->head_page = rb_set_head_page(cpu_buffer);
else
rb_inc_page(&iter->head_page);
iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
iter->head = 0;
iter->next_event = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* rb_handle_head_page - writer hit the head page
*
* Returns: +1 to retry page
* 0 to continue
* -1 on error
*/
static int
rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *tail_page,
struct buffer_page *next_page)
{
struct buffer_page *new_head;
int entries;
int type;
int ret;
entries = rb_page_entries(next_page);
/*
* The hard part is here. We need to move the head
* forward, and protect against both readers on
* other CPUs and writers coming in via interrupts.
*/
type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
RB_PAGE_HEAD);
/*
* type can be one of four:
* NORMAL - an interrupt already moved it for us
* HEAD - we are the first to get here.
* UPDATE - we are the interrupt interrupting
* a current move.
* MOVED - a reader on another CPU moved the next
* pointer to its reader page. Give up
* and try again.
*/
switch (type) {
case RB_PAGE_HEAD:
/*
* We changed the head to UPDATE, thus
* it is our responsibility to update
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
local_inc(&cpu_buffer->pages_lost);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* The entries will be zeroed out when we move the
* tail page.
*/
/* still more to do */
break;
case RB_PAGE_UPDATE:
/*
* This is an interrupt that interrupt the
* previous update. Still more to do.
*/
break;
case RB_PAGE_NORMAL:
/*
* An interrupt came in before the update
* and processed this for us.
* Nothing left to do.
*/
return 1;
case RB_PAGE_MOVED:
/*
* The reader is on another CPU and just did
* a swap with our next_page.
* Try again.
*/
return 1;
default:
RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
return -1;
}
/*
* Now that we are here, the old head pointer is
* set to UPDATE. This will keep the reader from
* swapping the head page with the reader page.
* The reader (on another CPU) will spin till
* we are finished.
*
* We just need to protect against interrupts
* doing the job. We will set the next pointer
* to HEAD. After that, we set the old pointer
* to NORMAL, but only if it was HEAD before.
* otherwise we are an interrupt, and only
* want the outer most commit to reset it.
*/
new_head = next_page;
rb_inc_page(&new_head);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
RB_PAGE_NORMAL);
/*
* Valid returns are:
* HEAD - an interrupt came in and already set it.
* NORMAL - One of two things:
* 1) We really set it.
* 2) A bunch of interrupts came in and moved
* the page forward again.
*/
switch (ret) {
case RB_PAGE_HEAD:
case RB_PAGE_NORMAL:
/* OK */
break;
default:
RB_WARN_ON(cpu_buffer, 1);
return -1;
}
/*
* It is possible that an interrupt came in,
* set the head up, then more interrupts came in
* and moved it again. When we get back here,
* the page would have been set to NORMAL but we
* just set it back to HEAD.
*
* How do you detect this? Well, if that happened
* the tail page would have moved.
*/
if (ret == RB_PAGE_NORMAL) {
struct buffer_page *buffer_tail_page;
buffer_tail_page = READ_ONCE(cpu_buffer->tail_page);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* If the tail had moved passed next, then we need
* to reset the pointer.
*/
if (buffer_tail_page != tail_page &&
buffer_tail_page != next_page)
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_head_page_set_normal(cpu_buffer, new_head,
next_page,
RB_PAGE_HEAD);
}
/*
* If this was the outer most commit (the one that
* changed the original pointer from HEAD to UPDATE),
* then it is up to us to reset it to NORMAL.
*/
if (type == RB_PAGE_HEAD) {
ret = rb_head_page_set_normal(cpu_buffer, next_page,
tail_page,
RB_PAGE_UPDATE);
if (RB_WARN_ON(cpu_buffer,
ret != RB_PAGE_UPDATE))
return -1;
}
return 0;
}
static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
/*
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
if (tail >= BUF_PAGE_SIZE) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
if (tail == BUF_PAGE_SIZE)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
return;
}
event = __rb_page_index(tail_page, tail);
/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event
* counter.
*/
tail_page->real_end = tail;
/*
* If this event is bigger than the minimum size, then
* we need to be careful that we don't subtract the
* write counter enough to allow another writer to slip
* in on this page.
* We put in a discarded commit instead, to make sure
* that this space is not used again, and this space will
* not be accounted into 'entries_bytes'.
*
* If we are less than the minimum size, we don't need to
* worry about it.
*/
if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
rb_event_set_padding(event);
ring-buffer: Fix race between reset page and reading page The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking. If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go passed the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded". The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding. Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues. To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it. To be paranoid, it will only spin for 1 second, otherwise it will warn and shutdown the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second! Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home Cc: Ingo Molnar <mingo@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li <jiazi.li@transsion.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-09-29 22:49:09 +08:00
/* Make sure the padding is visible before the write update */
smp_wmb();
/* Set the write back to the previous setting */
local_sub(length, &tail_page->write);
return;
}
/* Put in a discarded event */
event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
ring-buffer: Fix race between reset page and reading page The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking. If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go passed the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded". The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding. Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues. To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it. To be paranoid, it will only spin for 1 second, otherwise it will warn and shutdown the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second! Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home Cc: Ingo Molnar <mingo@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li <jiazi.li@transsion.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-09-29 22:49:09 +08:00
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
length = (tail + length) - BUF_PAGE_SIZE;
local_sub(length, &tail_page->write);
}
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer);
/*
* This is the slow path, force gcc not to inline it.
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct trace_buffer *buffer = cpu_buffer->buffer;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
struct buffer_page *next_page;
int ret;
next_page = tail_page;
rb_inc_page(&next_page);
/*
* If for some reason, we had an interrupt storm that made
* it all the way around the buffer, bail, and warn
* about it.
*/
if (unlikely(next_page == commit_page)) {
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
local_inc(&cpu_buffer->commit_overrun);
goto out_reset;
}
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* This is where the fun begins!
*
* We are fighting against races between a reader that
* could be on another CPU trying to swap its reader
* page with the buffer head.
*
* We are also fighting against interrupts coming in and
* moving the head or tail on us as well.
*
* If the next page is the head page then we have filled
* the buffer, unless the commit page is still on the
* reader page.
*/
if (rb_is_head_page(next_page, &tail_page->list)) {
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* If the commit is not on the reader page, then
* move the header page.
*/
if (!rb_is_reader_page(cpu_buffer->commit_page)) {
/*
* If we are not in overwrite mode,
* this is easy, just stop here.
*/
if (!(buffer->flags & RB_FL_OVERWRITE)) {
local_inc(&cpu_buffer->dropped_events);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
goto out_reset;
}
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
ret = rb_handle_head_page(cpu_buffer,
tail_page,
next_page);
if (ret < 0)
goto out_reset;
if (ret)
goto out_again;
} else {
/*
* We need to be careful here too. The
* commit page could still be on the reader
* page. We could have a small buffer, and
* have filled up the buffer with events
* from interrupts and such, and wrapped.
*
* Note, if the tail page is also on the
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
* reader_page, we let it move out.
*/
if (unlikely((cpu_buffer->commit_page !=
cpu_buffer->tail_page) &&
(cpu_buffer->commit_page ==
cpu_buffer->reader_page))) {
local_inc(&cpu_buffer->commit_overrun);
goto out_reset;
}
}
}
rb_tail_page_update(cpu_buffer, tail_page, next_page);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
out_again:
rb_reset_tail(cpu_buffer, tail, info);
/* Commit what we have for now. */
rb_end_commit(cpu_buffer);
/* rb_end_commit() decs committing */
local_inc(&cpu_buffer->committing);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
out_reset:
/* reset write */
rb_reset_tail(cpu_buffer, tail, info);
return NULL;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/* Slow path */
static struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
else
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
if (abs || rb_event_index(event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
/* nope, just zero it */
event->time_delta = 0;
event->array[0] = 0;
}
return skip_time_extend(event);
}
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static inline bool sched_clock_stable(void)
{
return true;
}
#endif
static void
rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
{
u64 write_stamp;
ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines, and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp), is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. The machine that it triggered on, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this, and the delta that is used larger than the max, the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamp are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backward. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-07-02 01:10:19 +08:00
WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
(unsigned long long)info->delta,
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
(unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
" echo global > /sys/kernel/tracing/trace_clock\n"
"or add trace_clock=global to the kernel command line\n");
}
static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event **event,
struct rb_event_info *info,
u64 *delta,
unsigned int *length)
{
bool abs = info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines, and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp), is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. The machine that it triggered on, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this, and the delta that is used larger than the max, the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamp are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backward. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-07-02 01:10:19 +08:00
if (unlikely(info->delta > (1ULL << 59))) {
/*
* Some timers can use more than 59 bits, and when a timestamp
* is added to the buffer, it will lose those bits.
*/
if (abs && (info->ts & TS_MSB)) {
info->delta &= ABS_TS_MASK;
ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines, and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp), is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. The machine that it triggered on, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this, and the delta that is used larger than the max, the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamp are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backward. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-07-02 01:10:19 +08:00
/* did the clock go backwards */
} else if (info->before == info->after && info->before > info->ts) {
ring-buffer: Do not trigger a WARN if clock going backwards is detected After tweaking the ring buffer to be a bit faster, a warning is triggering on one of my machines, and causing my tests to fail. This warning is caused when the delta (current time stamp minus previous time stamp), is larger than the max time held by the ring buffer (59 bits). If the clock were to go backwards slightly, this would then easily trigger this warning. The machine that it triggered on, the clock did go backwards by around 450 nanoseconds, and this happened after a recalibration of the TSC clock. Now that the ring buffer is faster, it detects this, and the delta that is used larger than the max, the warning is triggered and my test fails. To handle the clock going backwards, look at the saved before and after time stamps. If they are the same, it means that the current event did not interrupt another event, and that those timestamp are of a previous event that was recorded. If the max delta is triggered, look at those time stamps, make sure they are the same, then use them to compare with the current timestamp. If the current timestamp is less than the before/after time stamps, then that means the clock being used went backward. Print out a message that this has happened, but do not warn about it (and only print the message once). Still do the warning if the delta is indeed larger than what can be used. Also remove the unneeded KERN_WARNING from the WARN_ONCE() print. Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-07-02 01:10:19 +08:00
/* not interrupted */
static int once;
/*
* This is possible with a recalibrating of the TSC.
* Do not produce a call stack, but just report it.
*/
if (!once) {
once++;
pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
info->before, info->ts);
}
} else
rb_check_timestamp(cpu_buffer, info);
if (!abs)
info->delta = 0;
}
*event = rb_add_time_stamp(*event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
/**
* rb_update_event - update event type and data
* @cpu_buffer: The per cpu buffer of the @event
* @event: the event to update
* @info: The info to update the @event with (contains length and delta)
*
* Update the type and data fields of the @event. The length
* is the actual size that is written to the ring buffer,
* and with this, we can determine what to place into the
* data field.
*/
static void
rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event,
struct rb_event_info *info)
{
unsigned length = info->length;
u64 delta = info->delta;
unsigned int nest = local_read(&cpu_buffer->committing) - 1;
if (!WARN_ON_ONCE(nest >= MAX_NEST))
cpu_buffer->event_stamp[nest] = info->ts;
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
if (unlikely(info->add_timestamp))
rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
}
static unsigned rb_calculate_event_length(unsigned length)
{
struct ring_buffer_event event; /* Used only for sizeof array */
/* zero length can cause confusions */
if (!length)
length++;
if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
* in the header, we need to add a timestamp. If another
* event comes in when trying to discard this one to increase
* the length, then the timestamp will be added in the allocated
* space of this event. If length is bigger than the size needed
* for the TIME_EXTEND, then padding has to be used. The events
* length must be either RB_LEN_TIME_EXTEND, or greater than or equal
* to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
* As length is a multiple of 4, we only need to worry if it
* is 12 (RB_LEN_TIME_EXTEND + 4).
*/
if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
length += RB_ALIGNMENT;
return length;
}
static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
unsigned long new_index, old_index;
struct buffer_page *bpage;
unsigned long addr;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
addr &= PAGE_MASK;
bpage = READ_ONCE(cpu_buffer->tail_page);
ring-buffer: Remove useless update to write_stamp in rb_try_to_discard() When filtering is enabled, a temporary buffer is created to place the content of the trace event output so that the filter logic can decide from the trace event output if the trace event should be filtered out or not. If it is to be filtered out, the content in the temporary buffer is simply discarded, otherwise it is written into the trace buffer. But if an interrupt were to come in while a previous event was using that temporary buffer, the event written by the interrupt would actually go into the ring buffer itself to prevent corrupting the data on the temporary buffer. If the event is to be filtered out, the event in the ring buffer is discarded, or if it fails to discard because another event were to have already come in, it is turned into padding. The update to the write_stamp in the rb_try_to_discard() happens after a fix was made to force the next event after the discard to use an absolute timestamp by setting the before_stamp to zero so it does not match the write_stamp (which causes an event to use the absolute timestamp). But there's an effort in rb_try_to_discard() to put back the write_stamp to what it was before the event was added. But this is useless and wasteful because nothing is going to be using that write_stamp for calculations as it still will not match the before_stamp. Remove this useless update, and in doing so, we remove another cmpxchg64()! Also update the comments to reflect this change as well as remove some extra white space in another comment. Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Vincent Donnefort <vdonnefort@google.com> Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-15 21:18:10 +08:00
/*
* Make sure the tail_page is still the same and
* the next write location is the end of this event
*/
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/*
* For the before_stamp to be different than the write_stamp
* to make sure that the next event adds an absolute
* value and does not rely on the saved write stamp, which
* is now going to be bogus.
ring-buffer: Remove useless update to write_stamp in rb_try_to_discard() When filtering is enabled, a temporary buffer is created to place the content of the trace event output so that the filter logic can decide from the trace event output if the trace event should be filtered out or not. If it is to be filtered out, the content in the temporary buffer is simply discarded, otherwise it is written into the trace buffer. But if an interrupt were to come in while a previous event was using that temporary buffer, the event written by the interrupt would actually go into the ring buffer itself to prevent corrupting the data on the temporary buffer. If the event is to be filtered out, the event in the ring buffer is discarded, or if it fails to discard because another event were to have already come in, it is turned into padding. The update to the write_stamp in the rb_try_to_discard() happens after a fix was made to force the next event after the discard to use an absolute timestamp by setting the before_stamp to zero so it does not match the write_stamp (which causes an event to use the absolute timestamp). But there's an effort in rb_try_to_discard() to put back the write_stamp to what it was before the event was added. But this is useless and wasteful because nothing is going to be using that write_stamp for calculations as it still will not match the before_stamp. Remove this useless update, and in doing so, we remove another cmpxchg64()! Also update the comments to reflect this change as well as remove some extra white space in another comment. Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Vincent Donnefort <vdonnefort@google.com> Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-15 21:18:10 +08:00
*
* By setting the before_stamp to zero, the next event
* is not going to use the write_stamp and will instead
* create an absolute timestamp. This means there's no
* reason to update the wirte_stamp!
*/
rb_time_set(&cpu_buffer->before_stamp, 0);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
* the write stamp. The interrupting event will fix the
ring-buffer: Remove useless update to write_stamp in rb_try_to_discard() When filtering is enabled, a temporary buffer is created to place the content of the trace event output so that the filter logic can decide from the trace event output if the trace event should be filtered out or not. If it is to be filtered out, the content in the temporary buffer is simply discarded, otherwise it is written into the trace buffer. But if an interrupt were to come in while a previous event was using that temporary buffer, the event written by the interrupt would actually go into the ring buffer itself to prevent corrupting the data on the temporary buffer. If the event is to be filtered out, the event in the ring buffer is discarded, or if it fails to discard because another event were to have already come in, it is turned into padding. The update to the write_stamp in the rb_try_to_discard() happens after a fix was made to force the next event after the discard to use an absolute timestamp by setting the before_stamp to zero so it does not match the write_stamp (which causes an event to use the absolute timestamp). But there's an effort in rb_try_to_discard() to put back the write_stamp to what it was before the event was added. But this is useless and wasteful because nothing is going to be using that write_stamp for calculations as it still will not match the before_stamp. Remove this useless update, and in doing so, we remove another cmpxchg64()! Also update the comments to reflect this change as well as remove some extra white space in another comment. Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Vincent Donnefort <vdonnefort@google.com> Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-15 21:18:10 +08:00
* write stamp for us, and use an absolute timestamp.
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
*/
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
* and write to the next page. That is fine
* because we just shorten what is on this page.
*/
old_index += write_mask;
new_index += write_mask;
/* caution: old_index gets updated on cmpxchg failure */
if (local_try_cmpxchg(&bpage->write, &old_index, new_index)) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
return true;
}
}
/* could not discard */
return false;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits);
}
static __always_inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long max_count;
/*
* We only race with interrupts and NMIs on this CPU.
* If we own the commit event, then we can commit
* all others that interrupted us, since the interruptions
* are in stack format (they finish before they come
* back to us). This allows us to do a simple loop to
* assign the commit to the tail.
*/
again:
max_count = cpu_buffer->nr_pages * 100;
while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) {
if (RB_WARN_ON(cpu_buffer, !(--max_count)))
return;
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
ring-buffer: Fix race while reader and writer are on the same page When user reads file 'trace_pipe', kernel keeps printing following logs that warn at "cpu_buffer->reader_page->read > rb_page_size(reader)" in rb_get_reader_page(). It just looks like there's an infinite loop in tracing_read_pipe(). This problem occurs several times on arm64 platform when testing v5.10 and below. Call trace: rb_get_reader_page+0x248/0x1300 rb_buffer_peek+0x34/0x160 ring_buffer_peek+0xbc/0x224 peek_next_entry+0x98/0xbc __find_next_entry+0xc4/0x1c0 trace_find_next_entry_inc+0x30/0x94 tracing_read_pipe+0x198/0x304 vfs_read+0xb4/0x1e0 ksys_read+0x74/0x100 __arm64_sys_read+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180 Then I dump the vmcore and look into the problematic per_cpu ring_buffer, I found that tail_page/commit_page/reader_page are on the same page while reader_page->read is obviously abnormal: tail_page == commit_page == reader_page == { .write = 0x100d20, .read = 0x8f9f4805, // Far greater than 0xd20, obviously abnormal!!! .entries = 0x10004c, .real_end = 0x0, .page = { .time_stamp = 0x857257416af0, .commit = 0xd20, // This page hasn't been full filled. // .data[0...0xd20] seems normal. } } The root cause is most likely the race that reader and writer are on the same page while reader saw an event that not fully committed by writer. To fix this, add memory barriers to make sure the reader can see the content of what is committed. Since commit a0fcaaed0c46 ("ring-buffer: Fix race between reset page and reading page") has added the read barrier in rb_get_reader_page(), here we just need to add the write barrier. Link: https://lore.kernel.org/linux-trace-kernel/20230325021247.2923907-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 77ae365eca89 ("ring-buffer: make lockless") Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-03-25 10:12:47 +08:00
/*
* No need for a memory barrier here, as the update
* of the tail_page did it for this page.
*/
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
/* add barrier to keep gcc from optimizing too much */
barrier();
}
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
ring-buffer: Fix race while reader and writer are on the same page When user reads file 'trace_pipe', kernel keeps printing following logs that warn at "cpu_buffer->reader_page->read > rb_page_size(reader)" in rb_get_reader_page(). It just looks like there's an infinite loop in tracing_read_pipe(). This problem occurs several times on arm64 platform when testing v5.10 and below. Call trace: rb_get_reader_page+0x248/0x1300 rb_buffer_peek+0x34/0x160 ring_buffer_peek+0xbc/0x224 peek_next_entry+0x98/0xbc __find_next_entry+0xc4/0x1c0 trace_find_next_entry_inc+0x30/0x94 tracing_read_pipe+0x198/0x304 vfs_read+0xb4/0x1e0 ksys_read+0x74/0x100 __arm64_sys_read+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180 Then I dump the vmcore and look into the problematic per_cpu ring_buffer, I found that tail_page/commit_page/reader_page are on the same page while reader_page->read is obviously abnormal: tail_page == commit_page == reader_page == { .write = 0x100d20, .read = 0x8f9f4805, // Far greater than 0xd20, obviously abnormal!!! .entries = 0x10004c, .real_end = 0x0, .page = { .time_stamp = 0x857257416af0, .commit = 0xd20, // This page hasn't been full filled. // .data[0...0xd20] seems normal. } } The root cause is most likely the race that reader and writer are on the same page while reader saw an event that not fully committed by writer. To fix this, add memory barriers to make sure the reader can see the content of what is committed. Since commit a0fcaaed0c46 ("ring-buffer: Fix race between reset page and reading page") has added the read barrier in rb_get_reader_page(), here we just need to add the write barrier. Link: https://lore.kernel.org/linux-trace-kernel/20230325021247.2923907-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 77ae365eca89 ("ring-buffer: make lockless") Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-03-25 10:12:47 +08:00
/* Make sure the readers see the content of what is committed. */
smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
local_read(&cpu_buffer->commit_page->page->commit) &
~RB_WRITE_MASK);
barrier();
}
/* again, keep gcc from optimizing */
barrier();
/*
* If an interrupt came in just after the first while loop
* and pushed the tail page forward, we will be left with
* a dangling commit that will never go forward.
*/
if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)))
goto again;
}
static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
if (RB_WARN_ON(cpu_buffer,
!local_read(&cpu_buffer->committing)))
return;
again:
commits = local_read(&cpu_buffer->commits);
/* synchronize with interrupts */
barrier();
if (local_read(&cpu_buffer->committing) == 1)
rb_set_commit_to_write(cpu_buffer);
local_dec(&cpu_buffer->committing);
/* synchronize with interrupts */
barrier();
/*
* Need to account for interrupts coming in between the
* updating of the commit page and the clearing of the
* committing counter.
*/
if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
!local_read(&cpu_buffer->committing)) {
local_inc(&cpu_buffer->committing);
goto again;
}
}
static inline void rb_event_discard(struct ring_buffer_event *event)
{
if (extended_time(event))
event = skip_time_extend(event);
/* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
if (!event->time_delta)
event->time_delta = 1;
}
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->entries);
rb_end_commit(cpu_buffer);
}
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
irq_work_queue(&buffer->irq_work.work);
}
if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
irq_work_queue(&cpu_buffer->irq_work.work);
}
if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
return;
if (cpu_buffer->reader_page == cpu_buffer->commit_page)
return;
if (!cpu_buffer->irq_work.full_waiters_pending)
return;
cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
return;
cpu_buffer->irq_work.wakeup_full = true;
cpu_buffer->irq_work.full_waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
irq_work_queue(&cpu_buffer->irq_work.work);
}
#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
# define do_ring_buffer_record_recursion() \
do_ftrace_record_recursion(_THIS_IP_, _RET_IP_)
#else
# define do_ring_buffer_record_recursion() do { } while (0)
#endif
/*
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
2018-01-15 23:47:09 +08:00
* be modified more than once via an interrupt. To pass this
* information from the lock to the unlock without having to
* access the 'in_interrupt()' functions again (which do show
* a bit of overhead in something as critical as function tracing,
* we use a bitmask trick.
*
* bit 1 = NMI context
* bit 2 = IRQ context
* bit 3 = SoftIRQ context
* bit 4 = normal context.
*
2018-01-15 23:47:09 +08:00
* This works because this is the order of contexts that can
* preempt other contexts. A SoftIRQ never preempts an IRQ
* context.
*
* When the context is determined, the corresponding bit is
* checked and set (if it was set, then a recursion of that context
* happened).
*
* On unlock, we need to clear this bit. To do so, just subtract
* 1 from the current_context and AND it to itself.
*
* (binary)
* 101 - 1 = 100
* 101 & 100 = 100 (clearing bit zero)
*
* 1010 - 1 = 1001
* 1010 & 1001 = 1000 (clearing bit 1)
*
* The least significant bit can be cleared this way, and it
* just so happens that it is the same bit corresponding to
* the current context.
*
* Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
* is set when a recursion is detected at the current context, and if
* the TRANSITION bit is already set, it will fail the recursion.
* This is needed because there's a lag between the changing of
* interrupt context and updating the preempt count. In this case,
* a false positive will be found. To handle this, one extra recursion
* is allowed, and this is done by the TRANSITION bit. If the TRANSITION
* bit is already set, then it is considered a recursion and the function
* ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
*
* On the trace_recursive_unlock(), the TRANSITION bit will be the first
* to be cleared. Even if it wasn't the context that set it. That is,
* if an interrupt comes in while NORMAL bit is set and the ring buffer
* is called before preempt_count() is updated, since the check will
* be on the NORMAL bit, the TRANSITION bit will then be set. If an
* NMI then comes in, it will set the NMI bit, but when the NMI code
* does the trace_recursive_unlock() it will clear the TRANSITION bit
* and leave the NMI bit set. But this is fine, because the interrupt
* code that set the TRANSITION bit will then clear the NMI bit when it
* calls trace_recursive_unlock(). If another NMI comes in, it will
* set the TRANSITION bit and continue.
*
* Note: The TRANSITION bit only handles a single transition between context.
*/
static __always_inline bool
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
2018-01-15 23:47:09 +08:00
unsigned int val = cpu_buffer->current_context;
int bit = interrupt_context_level();
bit = RB_CTX_NORMAL - bit;
2018-01-15 23:47:09 +08:00
if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
/*
* It is possible that this was called by transitioning
* between interrupt context, and preempt_count() has not
* been updated yet. In this case, use the TRANSITION bit.
*/
bit = RB_CTX_TRANSITION;
if (val & (1 << (bit + cpu_buffer->nest))) {
do_ring_buffer_record_recursion();
return true;
}
}
val |= (1 << (bit + cpu_buffer->nest));
2018-01-15 23:47:09 +08:00
cpu_buffer->current_context = val;
return false;
}
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->current_context &=
cpu_buffer->current_context - (1 << cpu_buffer->nest);
}
/* The recursive locking above uses 5 bits */
#define NESTED_BITS 5
/**
* ring_buffer_nest_start - Allow to trace while nested
* @buffer: The ring buffer to modify
*
* The ring buffer has a safety mechanism to prevent recursion.
* But there may be a case where a trace needs to be done while
* tracing something else. In this case, calling this function
* will allow this function to nest within a currently active
* ring_buffer_lock_reserve().
*
* Call this function before calling another ring_buffer_lock_reserve() and
* call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
*/
void ring_buffer_nest_start(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
/* Enabled by ring_buffer_nest_end() */
preempt_disable_notrace();
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
/* This is the shift value for the above recursive locking */
cpu_buffer->nest += NESTED_BITS;
}
/**
* ring_buffer_nest_end - Allow to trace while nested
* @buffer: The ring buffer to modify
*
* Must be called after ring_buffer_nest_start() and after the
* ring_buffer_unlock_commit().
*/
void ring_buffer_nest_end(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
/* disabled by ring_buffer_nest_start() */
cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
/* This is the shift value for the above recursive locking */
cpu_buffer->nest -= NESTED_BITS;
preempt_enable_notrace();
}
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
*
* This commits the data to the ring buffer, and releases any locks held.
*
* Must be paired with ring_buffer_lock_reserve.
*/
int ring_buffer_unlock_commit(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu = raw_smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
rb_commit(cpu_buffer);
rb_wakeups(buffer, cpu_buffer);
trace_recursive_unlock(cpu_buffer);
preempt_enable_notrace();
return 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
/* Special value to validate all deltas on a page. */
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
{
struct ring_buffer_event *event;
u64 ts, delta;
int e;
ts = bpage->time_stamp;
pr_warn(" [%lld] PAGE TIME STAMP\n", ts);
for (e = 0; e < tail; e += rb_event_length(event)) {
event = (struct ring_buffer_event *)(bpage->data + e);
switch (event->type_len) {
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
break;
default:
break;
}
}
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
*/
static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
struct ring_buffer_event *event;
struct buffer_data_page *bpage;
u64 ts, delta;
bool full = false;
int e;
bpage = info->tail_page->page;
if (tail == CHECK_FULL_PAGE) {
full = true;
tail = local_read(&bpage->commit);
} else if (info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) {
/* Ignore events with absolute time stamps */
return;
}
/*
* Do not check the first event (skip possible extends too).
* Also do not check if previous events have not been committed.
*/
if (tail <= 8 || tail > local_read(&bpage->commit))
return;
/*
ring-buffer: Remove useless update to write_stamp in rb_try_to_discard() When filtering is enabled, a temporary buffer is created to place the content of the trace event output so that the filter logic can decide from the trace event output if the trace event should be filtered out or not. If it is to be filtered out, the content in the temporary buffer is simply discarded, otherwise it is written into the trace buffer. But if an interrupt were to come in while a previous event was using that temporary buffer, the event written by the interrupt would actually go into the ring buffer itself to prevent corrupting the data on the temporary buffer. If the event is to be filtered out, the event in the ring buffer is discarded, or if it fails to discard because another event were to have already come in, it is turned into padding. The update to the write_stamp in the rb_try_to_discard() happens after a fix was made to force the next event after the discard to use an absolute timestamp by setting the before_stamp to zero so it does not match the write_stamp (which causes an event to use the absolute timestamp). But there's an effort in rb_try_to_discard() to put back the write_stamp to what it was before the event was added. But this is useless and wasteful because nothing is going to be using that write_stamp for calculations as it still will not match the before_stamp. Remove this useless update, and in doing so, we remove another cmpxchg64()! Also update the comments to reflect this change as well as remove some extra white space in another comment. Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Vincent Donnefort <vdonnefort@google.com> Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-15 21:18:10 +08:00
* If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
ts = bpage->time_stamp;
for (e = 0; e < tail; e += rb_event_length(event)) {
event = (struct ring_buffer_event *)(bpage->data + e);
switch (event->type_len) {
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
break;
case RINGBUF_TYPE_PADDING:
if (event->time_delta == 1)
break;
fallthrough;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
break;
default:
RB_WARN_ON(cpu_buffer, 1);
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
/* If another report is happening, ignore this one */
if (atomic_inc_return(&ts_dump) != 1) {
atomic_dec(&ts_dump);
goto out;
}
atomic_inc(&cpu_buffer->record_disabled);
/* There's some cases in boot up that this can happen */
WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
full ? " (full)" : "");
dump_buffer_page(bpage, info, tail);
atomic_dec(&ts_dump);
/* Do not re-enable checking */
return;
}
out:
atomic_dec(this_cpu_ptr(&checking));
}
#else
static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info,
unsigned long tail)
{
}
#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct rb_event_info *info)
{
struct ring_buffer_event *event;
struct buffer_page *tail_page;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
unsigned long tail, write, w;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
bool a_ok;
bool b_ok;
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
info->delta = info->ts;
} else {
/*
* If interrupting an event time update, we may need an
* absolute timestamp.
* Don't bother if this is the start of a new page (w == 0).
*/
ring-buffer: Fix writing to the buffer with max_data_size The maximum ring buffer data size is the maximum size of data that can be recorded on the ring buffer. Events must be smaller than the sub buffer data size minus any meta data. This size is checked before trying to allocate from the ring buffer because the allocation assumes that the size will fit on the sub buffer. The maximum size was calculated as the size of a sub buffer page (which is currently PAGE_SIZE minus the sub buffer header) minus the size of the meta data of an individual event. But it missed the possible adding of a time stamp for events that are added long enough apart that the event meta data can't hold the time delta. When an event is added that is greater than the current BUF_MAX_DATA_SIZE minus the size of a time stamp, but still less than or equal to BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking for a page that can hold the event. Luckily, there's a check for this loop and after 1000 iterations and a warning is emitted and the ring buffer is disabled. But this should never happen. This can happen when a large event is added first, or after a long period where an absolute timestamp is prefixed to the event, increasing its size by 8 bytes. This passes the check and then goes into the algorithm that causes the infinite loop. For events that are the first event on the sub-buffer, it does not need to add a timestamp, because the sub-buffer itself contains an absolute timestamp, and adding one is redundant. The fix is to check if the event is to be the first event on the sub-buffer, and if it is, then do not add a timestamp. This also fixes 32 bit adding a timestamp when a read of before_stamp or write_stamp is interrupted. There's still no need to add that timestamp if the event is going to be the first event on the sub buffer. Also, if the buffer has "time_stamp_abs" set, then also check if the length plus the timestamp is greater than the BUF_MAX_DATA_SIZE. Link: https://lore.kernel.org/all/20231212104549.58863438@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231212071837.5fdd6c13@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231212111617.39e02849@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Fixes: a4543a2fa9ef3 ("ring-buffer: Get timestamp after event is allocated") Fixes: 58fbc3c63275c ("ring-buffer: Consolidate add_timestamp to remove some branches") Reported-by: Kent Overstreet <kent.overstreet@linux.dev> # (on IRC) Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-13 00:16:17 +08:00
if (!w) {
/* Use the sub-buffer timestamp */
info->delta = 0;
} else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
info->delta = info->ts - info->after;
if (unlikely(test_time_stamp(info->delta))) {
info->add_timestamp |= RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
}
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
}
}
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
/*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/*C*/ write = local_add_return(info->length, &tail_page->write);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/* set write to only the index of the write */
write &= RB_WRITE_MASK;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
tail = write - info->length;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE)) {
2023-12-12 00:44:20 +08:00
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
return rb_move_tail(cpu_buffer, tail, info);
}
if (likely(tail == w)) {
/* Nothing interrupted us between A and C */
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
/*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
ring-buffer: Do not try to put back write_stamp If an update to an event is interrupted by another event between the time the initial event allocated its buffer and where it wrote to the write_stamp, the code try to reset the write stamp back to the what it had just overwritten. It knows that it was overwritten via checking the before_stamp, and if it didn't match what it wrote to the before_stamp before it allocated its space, it knows it was overwritten. To put back the write_stamp, it uses the before_stamp it read. The problem here is that by writing the before_stamp to the write_stamp it makes the two equal again, which means that the write_stamp can be considered valid as the last timestamp written to the ring buffer. But this is not necessarily true. The event that interrupted the event could have been interrupted in a way that it was interrupted as well, and can end up leaving with an invalid write_stamp. But if this happens and returns to this context that uses the before_stamp to update the write_stamp again, it can possibly incorrectly make it valid, causing later events to have in correct time stamps. As it is OK to leave this function with an invalid write_stamp (one that doesn't match the before_stamp), there's no reason to try to make it valid again in this case. If this race happens, then just leave with the invalid write_stamp and the next event to come along will just add a absolute timestamp and validate everything again. Bonus points: This gets rid of another cmpxchg64! Link: https://lore.kernel.org/linux-trace-kernel/20231214222921.193037a7@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Joel Fernandes <joel@joelfernandes.org> Cc: Vincent Donnefort <vdonnefort@google.com> Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-15 11:29:21 +08:00
/*
* If something came in between C and D, the write stamp
* may now not be in sync. But that's fine as the before_stamp
* will be different and then next event will just be forced
* to use an absolute timestamp.
*/
if (likely(!(info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/* This did not interrupt any time update */
info->delta = info->ts - info->after;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
else
/* Just use full timestamp for interrupting event */
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
info->delta = info->ts;
check_buffer(cpu_buffer, info, tail);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. This is done by the following: /*A*/ w = current position on the ring buffer before = before_stamp after = write_stamp ts = read current timestamp if (before != after) { write_stamp is not valid, force adding an absolute timestamp. } /*B*/ before_stamp = ts /*C*/ write = local_add_return(event length, position on ring buffer) if (w == write - event length) { /* Nothing interrupted between A and C */ /*E*/ write_stamp = ts; delta = ts - after /* * If nothing interrupted again, * before_stamp == write_stamp and write_stamp * can be used to calculate the delta for * events that come in after this one. */ } else { /* * The slow path! * Was interrupted between A and C. */ This is the place that there's a bug. We currently have: after = write_stamp ts = read current timestamp /*F*/ if (write == current position on the ring buffer && after < ts && cmpxchg(write_stamp, after, ts)) { delta = ts - after; } else { delta = 0; } The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid. But this may not be the case: If a task context event was interrupted by softirq between B and C. And the softirq wrote an event that got interrupted by a hard irq between C and E. and the hard irq wrote an event (does not need to be interrupted) We have: /*B*/ before_stamp = ts of normal context ---> interrupted by softirq /*B*/ before_stamp = ts of softirq context ---> interrupted by hardirq /*B*/ before_stamp = ts of hard irq context /*E*/ write_stamp = ts of hard irq context /* matches and write_stamp valid */ <---- /*E*/ write_stamp = ts of softirq context /* No longer matches before_stamp, write_stamp is not valid! */ <--- w != write - length, go to slow path // Right now the order of events in the ring buffer is: // // |-- softirq event --|-- hard irq event --|-- normal context event --| // after = write_stamp (this is the ts of softirq) ts = read current timestamp if (write == current position on the ring buffer [true] && after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { delta = ts - after [Wrong!] The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly. The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this. Instead, the following logic can be done to fix this: before = before_stamp ts = read current timestamp before_stamp = ts after = write_stamp if (write == current position on the ring buffer && after == before && after < ts) { delta = ts - after } else { delta = 0; } The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C. As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time. Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-19 12:07:12 +08:00
/* Save the old before_stamp */
a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
RB_WARN_ON(cpu_buffer, !a_ok);
ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. This is done by the following: /*A*/ w = current position on the ring buffer before = before_stamp after = write_stamp ts = read current timestamp if (before != after) { write_stamp is not valid, force adding an absolute timestamp. } /*B*/ before_stamp = ts /*C*/ write = local_add_return(event length, position on ring buffer) if (w == write - event length) { /* Nothing interrupted between A and C */ /*E*/ write_stamp = ts; delta = ts - after /* * If nothing interrupted again, * before_stamp == write_stamp and write_stamp * can be used to calculate the delta for * events that come in after this one. */ } else { /* * The slow path! * Was interrupted between A and C. */ This is the place that there's a bug. We currently have: after = write_stamp ts = read current timestamp /*F*/ if (write == current position on the ring buffer && after < ts && cmpxchg(write_stamp, after, ts)) { delta = ts - after; } else { delta = 0; } The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid. But this may not be the case: If a task context event was interrupted by softirq between B and C. And the softirq wrote an event that got interrupted by a hard irq between C and E. and the hard irq wrote an event (does not need to be interrupted) We have: /*B*/ before_stamp = ts of normal context ---> interrupted by softirq /*B*/ before_stamp = ts of softirq context ---> interrupted by hardirq /*B*/ before_stamp = ts of hard irq context /*E*/ write_stamp = ts of hard irq context /* matches and write_stamp valid */ <---- /*E*/ write_stamp = ts of softirq context /* No longer matches before_stamp, write_stamp is not valid! */ <--- w != write - length, go to slow path // Right now the order of events in the ring buffer is: // // |-- softirq event --|-- hard irq event --|-- normal context event --| // after = write_stamp (this is the ts of softirq) ts = read current timestamp if (write == current position on the ring buffer [true] && after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { delta = ts - after [Wrong!] The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly. The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this. Instead, the following logic can be done to fix this: before = before_stamp ts = read current timestamp before_stamp = ts after = write_stamp if (write == current position on the ring buffer && after == before && after < ts) { delta = ts - after } else { delta = 0; } The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C. As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time. Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-19 12:07:12 +08:00
/*
* Read a new timestamp and update the before_stamp to make
* the next event after this one force using an absolute
* timestamp. This is in case an interrupt were to come in
* between E and F.
*/
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
ts = rb_time_stamp(cpu_buffer->buffer);
ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. This is done by the following: /*A*/ w = current position on the ring buffer before = before_stamp after = write_stamp ts = read current timestamp if (before != after) { write_stamp is not valid, force adding an absolute timestamp. } /*B*/ before_stamp = ts /*C*/ write = local_add_return(event length, position on ring buffer) if (w == write - event length) { /* Nothing interrupted between A and C */ /*E*/ write_stamp = ts; delta = ts - after /* * If nothing interrupted again, * before_stamp == write_stamp and write_stamp * can be used to calculate the delta for * events that come in after this one. */ } else { /* * The slow path! * Was interrupted between A and C. */ This is the place that there's a bug. We currently have: after = write_stamp ts = read current timestamp /*F*/ if (write == current position on the ring buffer && after < ts && cmpxchg(write_stamp, after, ts)) { delta = ts - after; } else { delta = 0; } The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid. But this may not be the case: If a task context event was interrupted by softirq between B and C. And the softirq wrote an event that got interrupted by a hard irq between C and E. and the hard irq wrote an event (does not need to be interrupted) We have: /*B*/ before_stamp = ts of normal context ---> interrupted by softirq /*B*/ before_stamp = ts of softirq context ---> interrupted by hardirq /*B*/ before_stamp = ts of hard irq context /*E*/ write_stamp = ts of hard irq context /* matches and write_stamp valid */ <---- /*E*/ write_stamp = ts of softirq context /* No longer matches before_stamp, write_stamp is not valid! */ <--- w != write - length, go to slow path // Right now the order of events in the ring buffer is: // // |-- softirq event --|-- hard irq event --|-- normal context event --| // after = write_stamp (this is the ts of softirq) ts = read current timestamp if (write == current position on the ring buffer [true] && after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { delta = ts - after [Wrong!] The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly. The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this. Instead, the following logic can be done to fix this: before = before_stamp ts = read current timestamp before_stamp = ts after = write_stamp if (write == current position on the ring buffer && after == before && after < ts) { delta = ts - after } else { delta = 0; } The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C. As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time. Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-19 12:07:12 +08:00
rb_time_set(&cpu_buffer->before_stamp, ts);
barrier();
/*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
/* Was interrupted before here, write_stamp must be valid */
RB_WARN_ON(cpu_buffer, !a_ok);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
barrier();
ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. This is done by the following: /*A*/ w = current position on the ring buffer before = before_stamp after = write_stamp ts = read current timestamp if (before != after) { write_stamp is not valid, force adding an absolute timestamp. } /*B*/ before_stamp = ts /*C*/ write = local_add_return(event length, position on ring buffer) if (w == write - event length) { /* Nothing interrupted between A and C */ /*E*/ write_stamp = ts; delta = ts - after /* * If nothing interrupted again, * before_stamp == write_stamp and write_stamp * can be used to calculate the delta for * events that come in after this one. */ } else { /* * The slow path! * Was interrupted between A and C. */ This is the place that there's a bug. We currently have: after = write_stamp ts = read current timestamp /*F*/ if (write == current position on the ring buffer && after < ts && cmpxchg(write_stamp, after, ts)) { delta = ts - after; } else { delta = 0; } The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid. But this may not be the case: If a task context event was interrupted by softirq between B and C. And the softirq wrote an event that got interrupted by a hard irq between C and E. and the hard irq wrote an event (does not need to be interrupted) We have: /*B*/ before_stamp = ts of normal context ---> interrupted by softirq /*B*/ before_stamp = ts of softirq context ---> interrupted by hardirq /*B*/ before_stamp = ts of hard irq context /*E*/ write_stamp = ts of hard irq context /* matches and write_stamp valid */ <---- /*E*/ write_stamp = ts of softirq context /* No longer matches before_stamp, write_stamp is not valid! */ <--- w != write - length, go to slow path // Right now the order of events in the ring buffer is: // // |-- softirq event --|-- hard irq event --|-- normal context event --| // after = write_stamp (this is the ts of softirq) ts = read current timestamp if (write == current position on the ring buffer [true] && after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { delta = ts - after [Wrong!] The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly. The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this. Instead, the following logic can be done to fix this: before = before_stamp ts = read current timestamp before_stamp = ts after = write_stamp if (write == current position on the ring buffer && after == before && after < ts) { delta = ts - after } else { delta = 0; } The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C. As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time. Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-19 12:07:12 +08:00
/*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
info->after == info->before && info->after < ts) {
/*
* Nothing came after this event between C and F, it is
* safe to use info->after for the delta as it
* matched info->before and is still valid.
*/
info->delta = ts - info->after;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
} else {
/*
ring-buffer: Fix slowpath of interrupted event To synchronize the timestamps with the ring buffer reservation, there are two timestamps that are saved in the buffer meta data. 1. before_stamp 2. write_stamp When the two are equal, the write_stamp is considered valid, as in, it may be used to calculate the delta of the next event as the write_stamp is the timestamp of the previous reserved event on the buffer. This is done by the following: /*A*/ w = current position on the ring buffer before = before_stamp after = write_stamp ts = read current timestamp if (before != after) { write_stamp is not valid, force adding an absolute timestamp. } /*B*/ before_stamp = ts /*C*/ write = local_add_return(event length, position on ring buffer) if (w == write - event length) { /* Nothing interrupted between A and C */ /*E*/ write_stamp = ts; delta = ts - after /* * If nothing interrupted again, * before_stamp == write_stamp and write_stamp * can be used to calculate the delta for * events that come in after this one. */ } else { /* * The slow path! * Was interrupted between A and C. */ This is the place that there's a bug. We currently have: after = write_stamp ts = read current timestamp /*F*/ if (write == current position on the ring buffer && after < ts && cmpxchg(write_stamp, after, ts)) { delta = ts - after; } else { delta = 0; } The assumption is that if the current position on the ring buffer hasn't moved between C and F, then it also was not interrupted, and that the last event written has a timestamp that matches the write_stamp. That is the write_stamp is valid. But this may not be the case: If a task context event was interrupted by softirq between B and C. And the softirq wrote an event that got interrupted by a hard irq between C and E. and the hard irq wrote an event (does not need to be interrupted) We have: /*B*/ before_stamp = ts of normal context ---> interrupted by softirq /*B*/ before_stamp = ts of softirq context ---> interrupted by hardirq /*B*/ before_stamp = ts of hard irq context /*E*/ write_stamp = ts of hard irq context /* matches and write_stamp valid */ <---- /*E*/ write_stamp = ts of softirq context /* No longer matches before_stamp, write_stamp is not valid! */ <--- w != write - length, go to slow path // Right now the order of events in the ring buffer is: // // |-- softirq event --|-- hard irq event --|-- normal context event --| // after = write_stamp (this is the ts of softirq) ts = read current timestamp if (write == current position on the ring buffer [true] && after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { delta = ts - after [Wrong!] The delta is to be between the hard irq event and the normal context event, but the above logic made the delta between the softirq event and the normal context event, where the hard irq event is between the two. This will shift all the remaining event timestamps on the sub-buffer incorrectly. The write_stamp is only valid if it matches the before_stamp. The cmpxchg does nothing to help this. Instead, the following logic can be done to fix this: before = before_stamp ts = read current timestamp before_stamp = ts after = write_stamp if (write == current position on the ring buffer && after == before && after < ts) { delta = ts - after } else { delta = 0; } The above will only use the write_stamp if it still matches before_stamp and was tested to not have changed since C. As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! This means the 32-bit rb_time_t workaround can finally be removed. But that's for a later time. Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home Cc: stable@vger.kernel.org Cc: Masami Hiramatsu <mhiramat@kernel.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-19 12:07:12 +08:00
* Interrupted between C and F:
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
* came after this will still be correct (as they would
* have built their delta on the previous event.
*/
info->delta = 0;
}
info->ts = ts;
info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
}
/*
* If this is the first commit on the page, then it has the same
* timestamp as the page itself.
*/
if (unlikely(!tail && !(info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
info->delta = 0;
/* We reserved something on the buffer */
event = __rb_page_index(tail_page, tail);
rb_update_event(cpu_buffer, event, info);
local_inc(&tail_page->entries);
/*
* If this is the first commit on the page, then update
* its timestamp.
*/
if (unlikely(!tail))
tail_page->page->time_stamp = info->ts;
/* account for these added bytes */
local_add(info->length, &cpu_buffer->entries_bytes);
return event;
}
static __always_inline struct ring_buffer_event *
rb_reserve_next_event(struct trace_buffer *buffer,
struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_event *event;
struct rb_event_info info;
int nr_loops = 0;
int add_ts_default;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* ring buffer does cmpxchg, make sure it is safe in NMI context */
if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
(unlikely(in_nmi()))) {
return NULL;
}
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
rb_start_commit(cpu_buffer);
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
/* The commit page can not change after this */
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/*
* Due to the ability to swap a cpu buffer from a buffer
* it is possible it was swapped before we committed.
* (committing stops a swap). We check for it here and
* if it happened, we have to fail the write.
*/
barrier();
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
local_dec(&cpu_buffer->committing);
local_dec(&cpu_buffer->commits);
return NULL;
}
#endif
info.length = rb_calculate_event_length(length);
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
ring-buffer: Fix writing to the buffer with max_data_size The maximum ring buffer data size is the maximum size of data that can be recorded on the ring buffer. Events must be smaller than the sub buffer data size minus any meta data. This size is checked before trying to allocate from the ring buffer because the allocation assumes that the size will fit on the sub buffer. The maximum size was calculated as the size of a sub buffer page (which is currently PAGE_SIZE minus the sub buffer header) minus the size of the meta data of an individual event. But it missed the possible adding of a time stamp for events that are added long enough apart that the event meta data can't hold the time delta. When an event is added that is greater than the current BUF_MAX_DATA_SIZE minus the size of a time stamp, but still less than or equal to BUF_MAX_DATA_SIZE, the ring buffer would go into an infinite loop, looking for a page that can hold the event. Luckily, there's a check for this loop and after 1000 iterations and a warning is emitted and the ring buffer is disabled. But this should never happen. This can happen when a large event is added first, or after a long period where an absolute timestamp is prefixed to the event, increasing its size by 8 bytes. This passes the check and then goes into the algorithm that causes the infinite loop. For events that are the first event on the sub-buffer, it does not need to add a timestamp, because the sub-buffer itself contains an absolute timestamp, and adding one is redundant. The fix is to check if the event is to be the first event on the sub-buffer, and if it is, then do not add a timestamp. This also fixes 32 bit adding a timestamp when a read of before_stamp or write_stamp is interrupted. There's still no need to add that timestamp if the event is going to be the first event on the sub buffer. Also, if the buffer has "time_stamp_abs" set, then also check if the length plus the timestamp is greater than the BUF_MAX_DATA_SIZE. Link: https://lore.kernel.org/all/20231212104549.58863438@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20231212071837.5fdd6c13@gandalf.local.home Link: https://lore.kernel.org/linux-trace-kernel/20231212111617.39e02849@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland <mark.rutland@arm.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Fixes: a4543a2fa9ef3 ("ring-buffer: Get timestamp after event is allocated") Fixes: 58fbc3c63275c ("ring-buffer: Consolidate add_timestamp to remove some branches") Reported-by: Kent Overstreet <kent.overstreet@linux.dev> # (on IRC) Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-12-13 00:16:17 +08:00
if (info.length > BUF_MAX_DATA_SIZE)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
}
again:
info.add_timestamp = add_ts_default;
info.delta = 0;
/*
* We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop
* back here. Even with heavy interrupts happening, this
* should only happen a few times in a row. If this happens
* 1000 times in a row, there must be either an interrupt
* storm or we have something buggy.
* Bail!
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
goto out_fail;
event = __rb_reserve_next(cpu_buffer, &info);
if (unlikely(PTR_ERR(event) == -EAGAIN)) {
if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
info.length -= RB_LEN_TIME_EXTEND;
goto again;
}
ring-buffer: Have nested events still record running time stamp Up until now, if an event is interrupted while it is recorded by an interrupt, and that interrupt records events, the time of those events will all be the same. This is because events only record the delta of the time since the previous event (or beginning of a page), and to handle updating the time keeping for that of nested events is extremely racy. After years of thinking about this and several failed attempts, I finally have a solution to solve this puzzle. The problem is that you need to atomically calculate the delta and then update the time stamp you made the delta from, as well as then record it into the buffer, all this while at any time an interrupt can come in and do the same thing. This is easy to solve with heavy weight atomics, but that would be detrimental to the performance of the ring buffer. The current state of affairs sacrificed the time deltas for nested events for performance. The reason for previous failed attempts at solving this puzzle was because I was trying to completely avoid slow atomic operations like cmpxchg. I final came to the conclusion to always avoid cmpxchg is not possible, which is why those previous attempts always failed. But it is possible to pick one path (the most common case) and avoid cmpxchg in that path, which is the "fast path". The most common case is that an event will not be interrupted and have other events added into it. An event can detect if it has interrupted another event, and for these cases we can make it the slow path and use the heavy operations like cmpxchg. One more player was added to the game that made this possible, and that is the "absolute timestamp" (by Tom Zanussi) that allows us to inject a full 59 bit time stamp. (Of course this breaks if a machine is running for more than 18 years without a reboot!). There's barrier() placements around for being paranoid, even when they are not needed because of other atomic functions near by. But those should not hurt, as if they are not needed, they basically become a nop. Note, this also makes the race window much smaller, which means there are less slow paths to slow down the performance. The basic idea is that there's two main paths taken. 1) Not being interrupted between time stamps and reserving buffer space. In this case, the time stamps taken are true to the location in the buffer. 2) Was interrupted by another path between taking time stamps and reserving buffer space. The objective is to know what the delta is from the last reserved location in the buffer. As it is possible to detect if an event is interrupting another event before reserving data, space is added to the length to be reserved to inject a full time stamp along with the event being reserved. When an event is not interrupted, the write stamp is always the time of the last event written to the buffer. In path 1, there's two sub paths we care about: a) The event did not interrupt another event. b) The event interrupted another event. In case a, as the write stamp was read and known to be correct, the delta between the current time stamp and the write stamp is the delta between the current event and the previously recorded event. In case b, extra space was reserved to just put the full time stamp into the buffer. Which is done, as stated, in this path the time stamp taken is known to match the location in the buffer. In path 2, there's also two sub paths we care about: a) The event was not interrupted by another event since it reserved space on the buffer and re-reading the write stamp. b) The event was interrupted by another event. In case a, the write stamp is that of the last event that interrupted this event between taking the time stamps and reserving. As no event came in after re-reading the write stamp, that event is known to be the time of the event directly before this event and the delta can be the new time stamp and the write stamp. In case b, one or more events came in between reserving the event and re-reading he write stamp. Since this event's buffer reservation is between other events at this path, there's no way to know what the delta is. But because an event interrupted this event after it started, its fine to just give a zero delta, and take the same time stamp as the events that happened within the event being recorded. Here's the implementation of the design of this solution: All this is per cpu, and only needs to worry about nested events (not parallel events). The players: write_tail: The index in the buffer where new events can be written to. It is incremented via local_add() to reserve space for a new event. before_stamp: A time stamp set by all events before reserving space. write_stamp: A time stamp updated by events after it has successfully reserved space. /* Save the current position of write */ [A] w = local_read(write_tail); barrier(); /* Read both before and write stamps before touching anything */ before = local_read(before_stamp); after = local_read(write_stamp); barrier(); /* * If before and after are the same, then this event is not * interrupting a time update. If it is, then reserve space for adding * a full time stamp (this can turn into a time extend which is * just an extended time delta but fill up the extra space). */ if (after != before) abs = true; ts = clock(); /* Now update the before_stamp (everyone does this!) */ [B] local_set(before_stamp, ts); /* Now reserve space on the buffer */ [C] write = local_add_return(len, write_tail); /* Set tail to be were this event's data is */ tail = write - len; if (w == tail) { /* Nothing interrupted this between A and C */ [D] local_set(write_stamp, ts); barrier(); [E] save_before = local_read(before_stamp); if (!abs) { /* This did not interrupt a time update */ delta = ts - after; } else { delta = ts; /* The full time stamp will be in use */ } if (ts != save_before) { /* slow path - Was interrupted between C and E */ /* The update to write_stamp could have overwritten the update to * it by the interrupting event, but before and after should be * the same for all completed top events */ after = local_read(write_stamp); if (save_before > after) local_cmpxchg(write_stamp, after, save_before); } } else { /* slow path - Interrupted between A and C */ after = local_read(write_stamp); temp_ts = clock(); barrier(); [F] if (write == local_read(write_tail) && after < temp_ts) { /* This was not interrupted since C and F * The last write_stamp is still valid for the previous event * in the buffer. */ delta = temp_ts - after; /* OK to keep this new time stamp */ ts = temp_ts; } else { /* Interrupted between C and F * Well, there's no use to try to know what the time stamp * is for the previous event. Just set delta to zero and * be the same time as that event that interrupted us before * the reservation of the buffer. */ delta = 0; } /* No need to use full timestamps here */ abs = 0; } Link: https://lkml.kernel.org/r/20200625094454.732790f7@oasis.local.home Link: https://lore.kernel.org/r/20200627010041.517736087@goodmis.org Link: http://lkml.kernel.org/r/20200629025258.957440797@goodmis.org Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:25 +08:00
if (likely(event))
return event;
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
out_fail:
rb_end_commit(cpu_buffer);
return NULL;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
*
* Returns a reserved event on the ring buffer to copy directly to.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* The user of this interface will need to get the body to write into
* and can use the ring_buffer_event_data() interface.
*
* The length is the length of the data needed, not the event length
* which also includes the event header.
*
* Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
* If NULL is returned, then nothing has been allocated or locked.
*/
struct ring_buffer_event *
ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
int cpu;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* If we are tracing schedule, we don't want to recurse */
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
preempt_disable_notrace();
ring-buffer: Add unlikelys to make fast path the default I was running the trace_event benchmark and noticed that the times to record a trace_event was all over the place. I looked at the assembly of the ring_buffer_lock_reserver() and saw this: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 1d jne ffffffff8113c60d <ring_buffer_lock_reserve+0x2d> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax +---- 74 12 je ffffffff8113c610 <ring_buffer_lock_reserve+0x30> | 65 ff 0d 5b e3 ec 7e decl %gs:0x7eece35b(%rip) # a960 <__preempt_count> | 0f 84 85 00 00 00 je ffffffff8113c690 <ring_buffer_lock_reserve+0xb0> | 31 c0 xor %eax,%eax | 5d pop %rbp | c3 retq | 90 nop +---> 65 44 8b 05 48 e3 ec mov %gs:0x7eece348(%rip),%r8d # a960 <__preempt_count> 7e 41 81 e0 ff ff ff 7f and $0x7fffffff,%r8d b0 08 mov $0x8,%al 65 8b 0d 58 36 ed 7e mov %gs:0x7eed3658(%rip),%ecx # fc80 <current_context> 41 f7 c0 00 ff 1f 00 test $0x1fff00,%r8d 74 1e je ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 f7 c0 00 00 10 00 test $0x100000,%r8d b0 01 mov $0x1,%al 75 13 jne ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 81 e0 00 00 0f 00 and $0xf0000,%r8d 49 83 f8 01 cmp $0x1,%r8 19 c0 sbb %eax,%eax 83 e0 02 and $0x2,%eax 83 c0 02 add $0x2,%eax 85 c8 test %ecx,%eax 75 ab jne ffffffff8113c5fe <ring_buffer_lock_reserve+0x1e> 09 c8 or %ecx,%eax 65 89 05 24 36 ed 7e mov %eax,%gs:0x7eed3624(%rip) # fc80 <current_context> The arrow is the fast path. After adding the unlikely's, the fast path looks a bit better: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 7b jne ffffffff8113c66b <ring_buffer_lock_reserve+0x8b> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax 0f 85 9f 00 00 00 jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 65 8b 0d 57 e3 ec 7e mov %gs:0x7eece357(%rip),%ecx # a960 <__preempt_count> 81 e1 ff ff ff 7f and $0x7fffffff,%ecx b0 08 mov $0x8,%al 65 8b 15 68 36 ed 7e mov %gs:0x7eed3668(%rip),%edx # fc80 <current_context> f7 c1 00 ff 1f 00 test $0x1fff00,%ecx 75 50 jne ffffffff8113c670 <ring_buffer_lock_reserve+0x90> 85 d0 test %edx,%eax 75 7d jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 09 d0 or %edx,%eax 65 89 05 53 36 ed 7e mov %eax,%gs:0x7eed3653(%rip) # fc80 <current_context> 65 8b 05 fc da ec 7e mov %gs:0x7eecdafc(%rip),%eax # a130 <cpu_number> 89 c2 mov %eax,%edx Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-05-22 05:39:29 +08:00
if (unlikely(atomic_read(&buffer->record_disabled)))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu = raw_smp_processor_id();
ring-buffer: Add unlikelys to make fast path the default I was running the trace_event benchmark and noticed that the times to record a trace_event was all over the place. I looked at the assembly of the ring_buffer_lock_reserver() and saw this: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 1d jne ffffffff8113c60d <ring_buffer_lock_reserve+0x2d> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax +---- 74 12 je ffffffff8113c610 <ring_buffer_lock_reserve+0x30> | 65 ff 0d 5b e3 ec 7e decl %gs:0x7eece35b(%rip) # a960 <__preempt_count> | 0f 84 85 00 00 00 je ffffffff8113c690 <ring_buffer_lock_reserve+0xb0> | 31 c0 xor %eax,%eax | 5d pop %rbp | c3 retq | 90 nop +---> 65 44 8b 05 48 e3 ec mov %gs:0x7eece348(%rip),%r8d # a960 <__preempt_count> 7e 41 81 e0 ff ff ff 7f and $0x7fffffff,%r8d b0 08 mov $0x8,%al 65 8b 0d 58 36 ed 7e mov %gs:0x7eed3658(%rip),%ecx # fc80 <current_context> 41 f7 c0 00 ff 1f 00 test $0x1fff00,%r8d 74 1e je ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 f7 c0 00 00 10 00 test $0x100000,%r8d b0 01 mov $0x1,%al 75 13 jne ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 81 e0 00 00 0f 00 and $0xf0000,%r8d 49 83 f8 01 cmp $0x1,%r8 19 c0 sbb %eax,%eax 83 e0 02 and $0x2,%eax 83 c0 02 add $0x2,%eax 85 c8 test %ecx,%eax 75 ab jne ffffffff8113c5fe <ring_buffer_lock_reserve+0x1e> 09 c8 or %ecx,%eax 65 89 05 24 36 ed 7e mov %eax,%gs:0x7eed3624(%rip) # fc80 <current_context> The arrow is the fast path. After adding the unlikely's, the fast path looks a bit better: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 7b jne ffffffff8113c66b <ring_buffer_lock_reserve+0x8b> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax 0f 85 9f 00 00 00 jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 65 8b 0d 57 e3 ec 7e mov %gs:0x7eece357(%rip),%ecx # a960 <__preempt_count> 81 e1 ff ff ff 7f and $0x7fffffff,%ecx b0 08 mov $0x8,%al 65 8b 15 68 36 ed 7e mov %gs:0x7eed3668(%rip),%edx # fc80 <current_context> f7 c1 00 ff 1f 00 test $0x1fff00,%ecx 75 50 jne ffffffff8113c670 <ring_buffer_lock_reserve+0x90> 85 d0 test %edx,%eax 75 7d jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 09 d0 or %edx,%eax 65 89 05 53 36 ed 7e mov %eax,%gs:0x7eed3653(%rip) # fc80 <current_context> 65 8b 05 fc da ec 7e mov %gs:0x7eecdafc(%rip),%eax # a130 <cpu_number> 89 c2 mov %eax,%edx Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-05-22 05:39:29 +08:00
if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
ring-buffer: Add unlikelys to make fast path the default I was running the trace_event benchmark and noticed that the times to record a trace_event was all over the place. I looked at the assembly of the ring_buffer_lock_reserver() and saw this: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 1d jne ffffffff8113c60d <ring_buffer_lock_reserve+0x2d> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax +---- 74 12 je ffffffff8113c610 <ring_buffer_lock_reserve+0x30> | 65 ff 0d 5b e3 ec 7e decl %gs:0x7eece35b(%rip) # a960 <__preempt_count> | 0f 84 85 00 00 00 je ffffffff8113c690 <ring_buffer_lock_reserve+0xb0> | 31 c0 xor %eax,%eax | 5d pop %rbp | c3 retq | 90 nop +---> 65 44 8b 05 48 e3 ec mov %gs:0x7eece348(%rip),%r8d # a960 <__preempt_count> 7e 41 81 e0 ff ff ff 7f and $0x7fffffff,%r8d b0 08 mov $0x8,%al 65 8b 0d 58 36 ed 7e mov %gs:0x7eed3658(%rip),%ecx # fc80 <current_context> 41 f7 c0 00 ff 1f 00 test $0x1fff00,%r8d 74 1e je ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 f7 c0 00 00 10 00 test $0x100000,%r8d b0 01 mov $0x1,%al 75 13 jne ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 81 e0 00 00 0f 00 and $0xf0000,%r8d 49 83 f8 01 cmp $0x1,%r8 19 c0 sbb %eax,%eax 83 e0 02 and $0x2,%eax 83 c0 02 add $0x2,%eax 85 c8 test %ecx,%eax 75 ab jne ffffffff8113c5fe <ring_buffer_lock_reserve+0x1e> 09 c8 or %ecx,%eax 65 89 05 24 36 ed 7e mov %eax,%gs:0x7eed3624(%rip) # fc80 <current_context> The arrow is the fast path. After adding the unlikely's, the fast path looks a bit better: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 7b jne ffffffff8113c66b <ring_buffer_lock_reserve+0x8b> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax 0f 85 9f 00 00 00 jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 65 8b 0d 57 e3 ec 7e mov %gs:0x7eece357(%rip),%ecx # a960 <__preempt_count> 81 e1 ff ff ff 7f and $0x7fffffff,%ecx b0 08 mov $0x8,%al 65 8b 15 68 36 ed 7e mov %gs:0x7eed3668(%rip),%edx # fc80 <current_context> f7 c1 00 ff 1f 00 test $0x1fff00,%ecx 75 50 jne ffffffff8113c670 <ring_buffer_lock_reserve+0x90> 85 d0 test %edx,%eax 75 7d jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 09 d0 or %edx,%eax 65 89 05 53 36 ed 7e mov %eax,%gs:0x7eed3653(%rip) # fc80 <current_context> 65 8b 05 fc da ec 7e mov %gs:0x7eecdafc(%rip),%eax # a130 <cpu_number> 89 c2 mov %eax,%edx Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-05-22 05:39:29 +08:00
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Add unlikelys to make fast path the default I was running the trace_event benchmark and noticed that the times to record a trace_event was all over the place. I looked at the assembly of the ring_buffer_lock_reserver() and saw this: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 1d jne ffffffff8113c60d <ring_buffer_lock_reserve+0x2d> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax +---- 74 12 je ffffffff8113c610 <ring_buffer_lock_reserve+0x30> | 65 ff 0d 5b e3 ec 7e decl %gs:0x7eece35b(%rip) # a960 <__preempt_count> | 0f 84 85 00 00 00 je ffffffff8113c690 <ring_buffer_lock_reserve+0xb0> | 31 c0 xor %eax,%eax | 5d pop %rbp | c3 retq | 90 nop +---> 65 44 8b 05 48 e3 ec mov %gs:0x7eece348(%rip),%r8d # a960 <__preempt_count> 7e 41 81 e0 ff ff ff 7f and $0x7fffffff,%r8d b0 08 mov $0x8,%al 65 8b 0d 58 36 ed 7e mov %gs:0x7eed3658(%rip),%ecx # fc80 <current_context> 41 f7 c0 00 ff 1f 00 test $0x1fff00,%r8d 74 1e je ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 f7 c0 00 00 10 00 test $0x100000,%r8d b0 01 mov $0x1,%al 75 13 jne ffffffff8113c64f <ring_buffer_lock_reserve+0x6f> 41 81 e0 00 00 0f 00 and $0xf0000,%r8d 49 83 f8 01 cmp $0x1,%r8 19 c0 sbb %eax,%eax 83 e0 02 and $0x2,%eax 83 c0 02 add $0x2,%eax 85 c8 test %ecx,%eax 75 ab jne ffffffff8113c5fe <ring_buffer_lock_reserve+0x1e> 09 c8 or %ecx,%eax 65 89 05 24 36 ed 7e mov %eax,%gs:0x7eed3624(%rip) # fc80 <current_context> The arrow is the fast path. After adding the unlikely's, the fast path looks a bit better: <ring_buffer_lock_reserve>: 31 c0 xor %eax,%eax 48 83 3d 76 47 bd 00 cmpq $0x1,0xbd4776(%rip) # ffffffff81d10d60 <ring_buffer_flags> 01 55 push %rbp 48 89 e5 mov %rsp,%rbp 75 7b jne ffffffff8113c66b <ring_buffer_lock_reserve+0x8b> 65 ff 05 69 e3 ec 7e incl %gs:0x7eece369(%rip) # a960 <__preempt_count> 8b 47 08 mov 0x8(%rdi),%eax 85 c0 test %eax,%eax 0f 85 9f 00 00 00 jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 65 8b 0d 57 e3 ec 7e mov %gs:0x7eece357(%rip),%ecx # a960 <__preempt_count> 81 e1 ff ff ff 7f and $0x7fffffff,%ecx b0 08 mov $0x8,%al 65 8b 15 68 36 ed 7e mov %gs:0x7eed3668(%rip),%edx # fc80 <current_context> f7 c1 00 ff 1f 00 test $0x1fff00,%ecx 75 50 jne ffffffff8113c670 <ring_buffer_lock_reserve+0x90> 85 d0 test %edx,%eax 75 7d jne ffffffff8113c6a1 <ring_buffer_lock_reserve+0xc1> 09 d0 or %edx,%eax 65 89 05 53 36 ed 7e mov %eax,%gs:0x7eed3653(%rip) # fc80 <current_context> 65 8b 05 fc da ec 7e mov %gs:0x7eecdafc(%rip),%eax # a130 <cpu_number> 89 c2 mov %eax,%edx Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2015-05-22 05:39:29 +08:00
if (unlikely(length > BUF_MAX_DATA_SIZE))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (unlikely(trace_recursive_lock(cpu_buffer)))
goto out;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!event)
goto out_unlock;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return event;
out_unlock:
trace_recursive_unlock(cpu_buffer);
out:
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
preempt_enable_notrace();
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Decrement the entries to the page that an event is on.
* The event does not even need to exist, only the pointer
* to the page it is on. This may only be called before the commit
* takes place.
*/
static inline void
rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
addr &= PAGE_MASK;
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
local_dec(&bpage->entries);
return;
}
/*
* Because the commit page may be on the reader page we
* start with the next page and check the end loop there.
*/
rb_inc_page(&bpage);
start = bpage;
do {
if (bpage->page == (void *)addr) {
local_dec(&bpage->entries);
return;
}
rb_inc_page(&bpage);
} while (bpage != start);
/* commit not part of this buffer?? */
RB_WARN_ON(cpu_buffer, 1);
}
/**
* ring_buffer_discard_commit - discard an event that has not been committed
* @buffer: the ring buffer
* @event: non committed event to discard
*
* Sometimes an event that is in the ring buffer needs to be ignored.
* This function lets the user discard an event in the ring buffer
* and then that event will not be read later.
*
* This function only works if it is called before the item has been
* committed. It will try to free the event from the ring buffer
* if another event has not been added behind it.
*
* If another event has been added behind it, it will set the event
* up as discarded, and perform the commit.
*
* If this function is called, do not call ring_buffer_unlock_commit on
* the event.
*/
void ring_buffer_discard_commit(struct trace_buffer *buffer,
struct ring_buffer_event *event)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
/* The event is discarded regardless */
tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [<ffffffff8025ce68>] warn_slowpath+0xd8/0x130 [ 111.119873] [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [<ffffffff802199b0>] ? native_sched_clock+0x20/0x70 [ 111.119899] [<ffffffff80286dee>] ? put_lock_stats+0xe/0x30 [ 111.119906] [<ffffffff80286eb8>] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [<ffffffff802c8ae7>] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [<ffffffff802cd110>] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [<ffffffff802ce000>] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [<ffffffff802474e8>] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [<ffffffff808103b7>] __schedule+0x3a7/0x4cd [ 111.119957] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.119964] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.119971] [<ffffffff80810c08>] schedule+0x18/0x40 [ 111.119977] [<ffffffff80810e09>] preempt_schedule+0x39/0x60 [ 111.119985] [<ffffffff80813bd3>] _read_unlock+0x53/0x60 [ 111.119993] [<ffffffff807259d2>] sock_def_readable+0x72/0x80 [ 111.120002] [<ffffffff807ad5ed>] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [<ffffffff807219a3>] sock_aio_write+0x143/0x160 [ 111.120019] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120026] [<ffffffff80721860>] ? sock_aio_write+0x0/0x160 [ 111.120033] [<ffffffff80721860>] ? sock_aio_write+0x0/0x160 [ 111.120042] [<ffffffff8031c283>] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120057] [<ffffffff80276ff0>] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [<ffffffff8045d489>] ? cap_file_permission+0x9/0x10 [ 111.120074] [<ffffffff8045c1e6>] ? security_file_permission+0x16/0x20 [ 111.120082] [<ffffffff8031cab4>] do_readv_writev+0xd4/0x1f0 [ 111.120089] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120097] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120105] [<ffffffff8031cc18>] vfs_writev+0x48/0x70 [ 111.120111] [<ffffffff8031cd65>] sys_writev+0x55/0xc0 [ 111.120119] [<ffffffff80211e32>] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2009-04-20 05:39:33 +08:00
rb_event_discard(event);
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
cpu = smp_processor_id();
cpu_buffer = buffer->buffers[cpu];
/*
* This must only be called if the event has not been
* committed yet. Thus we can assume that preemption
* is still disabled.
*/
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
if (rb_try_to_discard(cpu_buffer, event))
goto out;
out:
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [<ffffffff8025ce68>] warn_slowpath+0xd8/0x130 [ 111.119873] [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [<ffffffff802199b0>] ? native_sched_clock+0x20/0x70 [ 111.119899] [<ffffffff80286dee>] ? put_lock_stats+0xe/0x30 [ 111.119906] [<ffffffff80286eb8>] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [<ffffffff802c8ae7>] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [<ffffffff802cd110>] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [<ffffffff802ce000>] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [<ffffffff802474e8>] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [<ffffffff808103b7>] __schedule+0x3a7/0x4cd [ 111.119957] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.119964] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.119971] [<ffffffff80810c08>] schedule+0x18/0x40 [ 111.119977] [<ffffffff80810e09>] preempt_schedule+0x39/0x60 [ 111.119985] [<ffffffff80813bd3>] _read_unlock+0x53/0x60 [ 111.119993] [<ffffffff807259d2>] sock_def_readable+0x72/0x80 [ 111.120002] [<ffffffff807ad5ed>] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [<ffffffff807219a3>] sock_aio_write+0x143/0x160 [ 111.120019] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120026] [<ffffffff80721860>] ? sock_aio_write+0x0/0x160 [ 111.120033] [<ffffffff80721860>] ? sock_aio_write+0x0/0x160 [ 111.120042] [<ffffffff8031c283>] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120057] [<ffffffff80276ff0>] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [<ffffffff8045d489>] ? cap_file_permission+0x9/0x10 [ 111.120074] [<ffffffff8045c1e6>] ? security_file_permission+0x16/0x20 [ 111.120082] [<ffffffff8031cab4>] do_readv_writev+0xd4/0x1f0 [ 111.120089] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120097] [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b [ 111.120105] [<ffffffff8031cc18>] vfs_writev+0x48/0x70 [ 111.120111] [<ffffffff8031cd65>] sys_writev+0x55/0xc0 [ 111.120119] [<ffffffff80211e32>] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2009-04-20 05:39:33 +08:00
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
preempt_enable_notrace();
}
EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_write - write data to the buffer without reserving
* @buffer: The ring buffer to write to.
* @length: The length of the data being written (excluding the event header)
* @data: The data to write to the buffer.
*
* This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
* one function. If you already have the data to write to the buffer, it
* may be easier to simply call this function.
*
* Note, like ring_buffer_lock_reserve, the length is the length of the data
* and not the length of the event which would hold the header.
*/
int ring_buffer_write(struct trace_buffer *buffer,
unsigned long length,
void *data)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
void *body;
int ret = -EBUSY;
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
int cpu;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
preempt_disable_notrace();
if (atomic_read(&buffer->record_disabled))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
if (length > BUF_MAX_DATA_SIZE)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
goto out;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!event)
goto out_unlock;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
body = rb_event_data(event);
memcpy(body, data, length);
rb_commit(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_wakeups(buffer, cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ret = 0;
out_unlock:
trace_recursive_unlock(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
out:
tracing: Remove ftrace_preempt_disable/enable The ftrace_preempt_disable/enable functions were to address a recursive race caused by the function tracer. The function tracer traces all functions which makes it easily susceptible to recursion. One area was preempt_enable(). This would call the scheduler and the schedulre would call the function tracer and loop. (So was it thought). The ftrace_preempt_disable/enable was made to protect against recursion inside the scheduler by storing the NEED_RESCHED flag. If it was set before the ftrace_preempt_disable() it would not call schedule on ftrace_preempt_enable(), thinking that if it was set before then it would have already scheduled unless it was already in the scheduler. This worked fine except in the case of SMP, where another task would set the NEED_RESCHED flag for a task on another CPU, and then kick off an IPI to trigger it. This could cause the NEED_RESCHED to be saved at ftrace_preempt_disable() but the IPI to arrive in the the preempt disabled section. The ftrace_preempt_enable() would not call the scheduler because the flag was already set before entring the section. This bug would cause a missed preemption check and cause lower latencies. Investigating further, I found that the recusion caused by the function tracer was not due to schedule(), but due to preempt_schedule(). Now that preempt_schedule is completely annotated with notrace, the recusion no longer is an issue. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-06-03 21:36:50 +08:00
preempt_enable_notrace();
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
struct buffer_page *head = rb_set_head_page(cpu_buffer);
struct buffer_page *commit = cpu_buffer->commit_page;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/* In case of error, head will be NULL */
if (unlikely(!head))
return true;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop. The "rb_per_cpu_empty()" misinterpret the condition (as not-empty) when "head_page" and "commit_page" of "struct ring_buffer_per_cpu" points to the same buffer page, whose "buffer_data_page" is empty and "read" field is non-zero. An error scenario could be constructed as followed (kernel perspective): 1. All pages in the buffer has been accessed by reader(s) so that all of them will have non-zero "read" field. 2. Read and clear all buffer pages so that "rb_num_of_entries()" will return 0 rendering there's no more data to read. It is also required that the "read_page", "commit_page" and "tail_page" points to the same page, while "head_page" is the next page of them. 3. Invoke "ring_buffer_lock_reserve()" with large enough "length" so that it shot pass the end of current tail buffer page. Now the "head_page", "commit_page" and "tail_page" points to the same page. 4. Discard current event with "ring_buffer_discard_commit()", so that "head_page", "commit_page" and "tail_page" points to a page whose buffer data page is now empty. When the error scenario has been constructed, "tracing_read_pipe" will be trapped inside a deadloop: "trace_empty()" returns 0 since "rb_per_cpu_empty()" returns 0 when it hits the CPU containing such constructed ring buffer. Then "trace_find_next_entry_inc()" always return NULL since "rb_num_of_entries()" reports there's no more entry to read. Finally "trace_seq_to_user()" returns "-EBUSY" spanking "tracing_read_pipe" back to the start of the "waitagain" loop. I've also written a proof-of-concept script to construct the scenario and trigger the bug automatically, you can use it to trace and validate my reasoning above: https://github.com/aegistudio/RingBufferDetonator.git Tests has been carried out on linux kernel 5.14-rc2 (2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version of kernel (for testing whether my update fixes the bug) and some older kernels (for range of affected kernels). Test result is also attached to the proof-of-concept repository. Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/ Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio Cc: stable@vger.kernel.org Fixes: bf41a158cacba ("ring-buffer: make reentrant") Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org> Signed-off-by: Haoran Luo <www@aegistudio.net> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2021-07-21 22:12:07 +08:00
/* Reader should exhaust content in reader page */
if (reader->read != rb_page_commit(reader))
return false;
/*
* If writers are committing on the reader page, knowing all
* committed content has been read, the ring buffer is empty.
*/
if (commit == reader)
return true;
/*
* If writers are committing on a page other than reader page
* and head page, there should always be content to read.
*/
if (commit != head)
return false;
/*
* Writers are committing on the head page, we just need
* to care about there're committed data, and the reader will
* swap reader page with head page when it is to read data.
*/
return rb_page_commit(commit) == 0;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_record_disable - stop all writes into the buffer
* @buffer: The ring buffer to stop writes to.
*
* This prevents all writes to the buffer. Any attempt to write
* to the buffer after this will fail and return NULL.
*
* The caller should call synchronize_rcu() after this.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
void ring_buffer_record_disable(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
atomic_inc(&buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_record_enable - enable writes to the buffer
* @buffer: The ring buffer to enable writes
*
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
void ring_buffer_record_enable(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
atomic_dec(&buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_record_off - stop all writes into the buffer
* @buffer: The ring buffer to stop writes to.
*
* This prevents all writes to the buffer. Any attempt to write
* to the buffer after this will fail and return NULL.
*
* This is different than ring_buffer_record_disable() as
* it works like an on/off switch, where as the disable() version
* must be paired with a enable().
*/
void ring_buffer_record_off(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
rd = atomic_read(&buffer->record_disabled);
do {
new_rd = rd | RB_BUFFER_OFF;
} while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
/**
* ring_buffer_record_on - restart writes into the buffer
* @buffer: The ring buffer to start writes to.
*
* This enables all writes to the buffer that was disabled by
* ring_buffer_record_off().
*
* This is different than ring_buffer_record_enable() as
* it works like an on/off switch, where as the enable() version
* must be paired with a disable().
*/
void ring_buffer_record_on(struct trace_buffer *buffer)
{
unsigned int rd;
unsigned int new_rd;
rd = atomic_read(&buffer->record_disabled);
do {
new_rd = rd & ~RB_BUFFER_OFF;
} while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
/**
* ring_buffer_record_is_on - return true if the ring buffer can write
* @buffer: The ring buffer to see if write is enabled
*
* Returns true if the ring buffer is in a state that it accepts writes.
*/
bool ring_buffer_record_is_on(struct trace_buffer *buffer)
{
return !atomic_read(&buffer->record_disabled);
}
ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar <mingo@redhat.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Hiraku Toyooka <hiraku.toyooka@cybertrust.co.jp> Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-07-14 00:28:15 +08:00
/**
* ring_buffer_record_is_set_on - return true if the ring buffer is set writable
* @buffer: The ring buffer to see if write is set enabled
*
* Returns true if the ring buffer is set writable by ring_buffer_record_on().
* Note that this does NOT mean it is in a writable state.
*
* It may return true when the ring buffer has been disabled by
* ring_buffer_record_disable(), as that is a temporary disabling of
* the ring buffer.
*/
bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
ring_buffer: tracing: Inherit the tracing setting to next ring buffer Maintain the tracing on/off setting of the ring_buffer when switching to the trace buffer snapshot. Taking a snapshot is done by swapping the backup ring buffer (max_tr_buffer). But since the tracing on/off setting is defined by the ring buffer, when swapping it, the tracing on/off setting can also be changed. This causes a strange result like below: /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 0 > tracing_on /sys/kernel/debug/tracing # cat tracing_on 0 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 1 /sys/kernel/debug/tracing # echo 1 > snapshot /sys/kernel/debug/tracing # cat tracing_on 0 We don't touch tracing_on, but snapshot changes tracing_on setting each time. This is an anomaly, because user doesn't know that each "ring_buffer" stores its own tracing-enable state and the snapshot is done by swapping ring buffers. Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devbox Cc: Ingo Molnar <mingo@redhat.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Hiraku Toyooka <hiraku.toyooka@cybertrust.co.jp> Cc: stable@vger.kernel.org Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> [ Updated commit log and comment in the code ] Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2018-07-14 00:28:15 +08:00
{
return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
*
* This prevents all writes to the buffer. Any attempt to write
* to the buffer after this will fail and return NULL.
*
* The caller should call synchronize_rcu() after this.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
atomic_inc(&cpu_buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_record_enable_cpu - enable writes to the buffer
* @buffer: The ring buffer to enable writes
* @cpu: The CPU to enable.
*
* Note, multiple disables will need the same number of enables
* to truly enable the writing (much like preempt_disable).
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
atomic_dec(&cpu_buffer->record_disabled);
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* The total entries in the ring buffer is the running counter
* of entries entered into the ring buffer, minus the sum of
* the entries read from the ring buffer and the number of
* entries that were overwritten.
*/
static inline unsigned long
rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
{
return local_read(&cpu_buffer->entries) -
(local_read(&cpu_buffer->overrun) + cpu_buffer->read);
}
/**
* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
{
unsigned long flags;
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
u64 ret = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (121 commits) perf symbols: Increase symbol KSYM_NAME_LEN size perf hists browser: Refuse 'a' hotkey on non symbolic views perf ui browser: Use libslang to read keys perf tools: Fix tracing info recording perf hists browser: Elide DSO column when it is set to just one DSO, ditto for threads perf hists: Don't consider filtered entries when calculating column widths perf hists: Don't decay total_period for filtered entries perf hists browser: Honour symbol_conf.show_{nr_samples,total_period} perf hists browser: Do not exit on tab key with single event perf annotate browser: Don't change selection line when returning from callq perf tools: handle endianness of feature bitmap perf tools: Add prelink suggestion to dso update message perf script: Fix unknown feature comment perf hists browser: Apply the dso and thread filters when merging new batches perf hists: Move the dso and thread filters from hist_browser perf ui browser: Honour the xterm colors perf top tui: Give color hints just on the percentage, like on --stdio perf ui browser: Make the colors configurable and change the defaults perf tui: Remove unneeded call to newtCls on startup perf hists: Don't format the percentage on hist_entry__snprintf ... Fix up conflicts in arch/x86/kernel/kprobes.c manually. Ingo's tree did the insane "add volatile to const array", which just doesn't make sense ("volatile const"?). But we could remove the const *and* make the array volatile to make doubly sure that gcc doesn't optimize it away.. Also fix up kernel/trace/ring_buffer.c non-data-conflicts manually: the reader_lock has been turned into a raw lock by the core locking merge, and there was a new user of it introduced in this perf core merge. Make sure that new use also uses the raw accessor functions.
2011-10-26 23:03:38 +08:00
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/*
* if the tail is on reader_page, oldest time stamp is on the reader
* page
*/
if (cpu_buffer->tail_page == cpu_buffer->reader_page)
bpage = cpu_buffer->reader_page;
else
bpage = rb_set_head_page(cpu_buffer);
if (bpage)
ret = bpage->page->time_stamp;
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (121 commits) perf symbols: Increase symbol KSYM_NAME_LEN size perf hists browser: Refuse 'a' hotkey on non symbolic views perf ui browser: Use libslang to read keys perf tools: Fix tracing info recording perf hists browser: Elide DSO column when it is set to just one DSO, ditto for threads perf hists: Don't consider filtered entries when calculating column widths perf hists: Don't decay total_period for filtered entries perf hists browser: Honour symbol_conf.show_{nr_samples,total_period} perf hists browser: Do not exit on tab key with single event perf annotate browser: Don't change selection line when returning from callq perf tools: handle endianness of feature bitmap perf tools: Add prelink suggestion to dso update message perf script: Fix unknown feature comment perf hists browser: Apply the dso and thread filters when merging new batches perf hists: Move the dso and thread filters from hist_browser perf ui browser: Honour the xterm colors perf top tui: Give color hints just on the percentage, like on --stdio perf ui browser: Make the colors configurable and change the defaults perf tui: Remove unneeded call to newtCls on startup perf hists: Don't format the percentage on hist_entry__snprintf ... Fix up conflicts in arch/x86/kernel/kprobes.c manually. Ingo's tree did the insane "add volatile to const array", which just doesn't make sense ("volatile const"?). But we could remove the const *and* make the array volatile to make doubly sure that gcc doesn't optimize it away.. Also fix up kernel/trace/ring_buffer.c non-data-conflicts manually: the reader_lock has been turned into a raw lock by the core locking merge, and there was a new user of it introduced in this perf core merge. Make sure that new use also uses the raw accessor functions.
2011-10-26 23:03:38 +08:00
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
/**
* ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to read from.
*/
unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_entries_cpu - get the number of entries in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the entries from.
*/
unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
return rb_num_of_entries(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_overrun_cpu - get the number of overruns caused by the ring
* buffer wrapping around (only if RB_FL_OVERWRITE is on).
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
ret = local_read(&cpu_buffer->overrun);
return ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_commit_overrun_cpu - get the number of overruns caused by
* commits failing due to the buffer wrapping around while there are uncommitted
* events, such as during an interrupt storm.
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
ret = local_read(&cpu_buffer->commit_overrun);
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
/**
* ring_buffer_dropped_events_cpu - get the number of dropped events caused by
* the ring buffer filling up (only if RB_FL_OVERWRITE is off).
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of overruns from
*/
unsigned long
ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
ret = local_read(&cpu_buffer->dropped_events);
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
/**
* ring_buffer_read_events_cpu - get the number of events successfully read
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the number of events read
*/
unsigned long
ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
cpu_buffer = buffer->buffers[cpu];
return cpu_buffer->read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_entries - get the number of entries in a buffer
* @buffer: The ring buffer
*
* Returns the total number of entries in the ring buffer
* (all CPU entries)
*/
unsigned long ring_buffer_entries(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long entries = 0;
int cpu;
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
entries += rb_num_of_entries(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return entries;
}
EXPORT_SYMBOL_GPL(ring_buffer_entries);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_overruns - get the number of overruns in buffer
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @buffer: The ring buffer
*
* Returns the total number of overruns in the ring buffer
* (all CPU entries)
*/
unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long overruns = 0;
int cpu;
/* if you care about this being correct, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
overruns += local_read(&cpu_buffer->overrun);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return overruns;
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static void rb_iter_reset(struct ring_buffer_iter *iter)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/* Iterator usage is expected to have record disabled */
ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [<c18796b3>] dump_stack+0x4b/0x75 [<c103a0e3>] warn_slowpath_common+0x7e/0x95 [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c103a185>] warn_slowpath_fmt+0x33/0x35 [<c10bd85a>] __trace_find_cmdline+0x66/0xaa^M [<c10bed04>] trace_find_cmdline+0x40/0x64 [<c10c3c16>] trace_print_context+0x27/0xec [<c10c4360>] ? trace_seq_printf+0x37/0x5b [<c10c0b15>] print_trace_line+0x319/0x39b [<c10ba3fb>] ? ring_buffer_read+0x47/0x50 [<c10c13b1>] s_show+0x192/0x1ab [<c10bfd9a>] ? s_next+0x5a/0x7c [<c112e76e>] seq_read+0x267/0x34c [<c1115a25>] vfs_read+0x8c/0xef [<c112e507>] ? seq_lseek+0x154/0x154 [<c1115ba2>] SyS_read+0x54/0x7f [<c188488e>] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator, must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page an not reset the iterator to inside the writable ring buffer. Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-08-07 02:11:33 +08:00
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [<c18796b3>] dump_stack+0x4b/0x75 [<c103a0e3>] warn_slowpath_common+0x7e/0x95 [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c103a185>] warn_slowpath_fmt+0x33/0x35 [<c10bd85a>] __trace_find_cmdline+0x66/0xaa^M [<c10bed04>] trace_find_cmdline+0x40/0x64 [<c10c3c16>] trace_print_context+0x27/0xec [<c10c4360>] ? trace_seq_printf+0x37/0x5b [<c10c0b15>] print_trace_line+0x319/0x39b [<c10ba3fb>] ? ring_buffer_read+0x47/0x50 [<c10c13b1>] s_show+0x192/0x1ab [<c10bfd9a>] ? s_next+0x5a/0x7c [<c112e76e>] seq_read+0x267/0x34c [<c1115a25>] vfs_read+0x8c/0xef [<c112e507>] ? seq_lseek+0x154/0x154 [<c1115ba2>] SyS_read+0x54/0x7f [<c188488e>] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator, must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page an not reset the iterator to inside the writable ring buffer. Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-08-07 02:11:33 +08:00
iter->cache_reader_page = iter->head_page;
ring-buffer: Fix infinite spin in reading buffer Commit 651e22f2701b "ring-buffer: Always reset iterator to reader page" fixed one bug but in the process caused another one. The reset is to update the header page, but that fix also changed the way the cached reads were updated. The cache reads are used to test if an iterator needs to be updated or not. A ring buffer iterator, when created, disables writes to the ring buffer but does not stop other readers or consuming reads from happening. Although all readers are synchronized via a lock, they are only synchronized when in the ring buffer functions. Those functions may be called by any number of readers. The iterator continues down when its not interrupted by a consuming reader. If a consuming read occurs, the iterator starts from the beginning of the buffer. The way the iterator sees that a consuming read has happened since its last read is by checking the reader "cache". The cache holds the last counts of the read and the reader page itself. Commit 651e22f2701b changed what was saved by the cache_read when the rb_iter_reset() occurred, making the iterator never match the cache. Then if the iterator calls rb_iter_reset(), it will go into an infinite loop by checking if the cache doesn't match, doing the reset and retrying, just to see that the cache still doesn't match! Which should never happen as the reset is suppose to set the cache to the current value and there's locks that keep a consuming reader from having access to the data. Fixes: 651e22f2701b "ring-buffer: Always reset iterator to reader page" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-10-03 04:51:18 +08:00
iter->cache_read = cpu_buffer->read;
iter->cache_pages_removed = cpu_buffer->pages_removed;
ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [<c18796b3>] dump_stack+0x4b/0x75 [<c103a0e3>] warn_slowpath_common+0x7e/0x95 [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c10bd85a>] ? __trace_find_cmdline+0x66/0xaa [<c103a185>] warn_slowpath_fmt+0x33/0x35 [<c10bd85a>] __trace_find_cmdline+0x66/0xaa^M [<c10bed04>] trace_find_cmdline+0x40/0x64 [<c10c3c16>] trace_print_context+0x27/0xec [<c10c4360>] ? trace_seq_printf+0x37/0x5b [<c10c0b15>] print_trace_line+0x319/0x39b [<c10ba3fb>] ? ring_buffer_read+0x47/0x50 [<c10c13b1>] s_show+0x192/0x1ab [<c10bfd9a>] ? s_next+0x5a/0x7c [<c112e76e>] seq_read+0x267/0x34c [<c1115a25>] vfs_read+0x8c/0xef [<c112e507>] ? seq_lseek+0x154/0x154 [<c1115ba2>] SyS_read+0x54/0x7f [<c188488e>] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator, must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page an not reset the iterator to inside the writable ring buffer. Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2014-08-07 02:11:33 +08:00
if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
iter->page_stamp = cpu_buffer->reader_page->page->time_stamp;
} else {
iter->read_stamp = iter->head_page->page->time_stamp;
iter->page_stamp = iter->read_stamp;
}
}
/**
* ring_buffer_iter_reset - reset an iterator
* @iter: The iterator to reset
*
* Resets the iterator, so that it will start from the beginning
* again.
*/
void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
if (!iter)
return;
cpu_buffer = iter->cpu_buffer;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_iter_reset(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_iter_empty - check if an iterator has no more to read
* @iter: The iterator to check
*/
int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
struct buffer_page *reader;
struct buffer_page *head_page;
struct buffer_page *commit_page;
struct buffer_page *curr_commit_page;
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
unsigned commit;
u64 curr_commit_ts;
u64 commit_ts;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = iter->cpu_buffer;
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
reader = cpu_buffer->reader_page;
head_page = cpu_buffer->head_page;
commit_page = cpu_buffer->commit_page;
commit_ts = commit_page->page->time_stamp;
/*
* When the writer goes across pages, it issues a cmpxchg which
* is a mb(), which will synchronize with the rmb here.
* (see rb_tail_page_update())
*/
smp_rmb();
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
commit = rb_page_commit(commit_page);
/* We want to make sure that the commit page doesn't change */
smp_rmb();
/* Make sure commit page didn't change */
curr_commit_page = READ_ONCE(cpu_buffer->commit_page);
curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp);
/* If the commit page changed, then there's more data */
if (curr_commit_page != commit_page ||
curr_commit_ts != commit_ts)
return 0;
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
/* Still racy, as it may return a false positive, but that's OK */
return ((iter->head_page == commit_page && iter->head >= commit) ||
ring-buffer: Have ring_buffer_iter_empty() return true when empty I noticed that reading the snapshot file when it is empty no longer gives a status. It suppose to show the status of the snapshot buffer as well as how to allocate and use it. For example: ># cat snapshot # tracer: nop # # # * Snapshot is allocated * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') But instead it just showed an empty buffer: ># cat snapshot # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | What happened was that it was using the ring_buffer_iter_empty() function to see if it was empty, and if it was, it showed the status. But that function was returning false when it was empty. The reason was that the iter header page was on the reader page, and the reader page was empty, but so was the buffer itself. The check only tested to see if the iter was on the commit page, but the commit page was no longer pointing to the reader page, but as all pages were empty, the buffer is also. Cc: stable@vger.kernel.org Fixes: 651e22f2701b ("ring-buffer: Always reset iterator to reader page") Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-04-20 02:29:46 +08:00
(iter->head_page == reader && commit_page == head_page &&
head_page->read == commit &&
iter->head == rb_page_commit(cpu_buffer->reader_page)));
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static void
rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
u64 delta;
switch (event->type_len) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_PADDING:
return;
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
delta = rb_fix_abs_ts(delta, cpu_buffer->read_stamp);
cpu_buffer->read_stamp = delta;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return;
case RINGBUF_TYPE_DATA:
cpu_buffer->read_stamp += event->time_delta;
return;
default:
RB_WARN_ON(cpu_buffer, 1);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
}
static void
rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
struct ring_buffer_event *event)
{
u64 delta;
switch (event->type_len) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_PADDING:
return;
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
iter->read_stamp += delta;
return;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
delta = rb_fix_abs_ts(delta, iter->read_stamp);
iter->read_stamp = delta;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return;
case RINGBUF_TYPE_DATA:
iter->read_stamp += event->time_delta;
return;
default:
RB_WARN_ON(iter->cpu_buffer, 1);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
}
static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct buffer_page *reader = NULL;
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
bool ret;
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
again:
/*
* This should normally only loop twice. But because the
* start of the reader inserts an empty page, it causes
* a case where we will loop three times. There should be no
* reason to loop four times (that I know of).
*/
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
reader = NULL;
goto out;
}
reader = cpu_buffer->reader_page;
/* If there's more to read, return this page */
if (cpu_buffer->reader_page->read < rb_page_size(reader))
goto out;
/* Never should we have an index greater than the size */
if (RB_WARN_ON(cpu_buffer,
cpu_buffer->reader_page->read > rb_page_size(reader)))
goto out;
/* check if we caught up to the tail */
reader = NULL;
if (cpu_buffer->commit_page == cpu_buffer->reader_page)
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Fix uninitialized read_stamp The ring buffer reader page is used to swap a page from the writable ring buffer. If the writer happens to be on that page, it ends up on the reader page, but will simply move off of it, back into the writable ring buffer as writes are added. The time stamp passed back to the readers is stored in the cpu_buffer per CPU descriptor. This stamp is updated when a swap of the reader page takes place, and it reads the current stamp from the page taken from the writable ring buffer. Everytime a writer goes to a new page, it updates the time stamp of that page. The problem happens if a reader reads a page from an empty per CPU ring buffer. If the buffer is empty, the swap still takes place, placing the writer at the start of the reader page. If at a later time, a write happens, it updates the page's time stamp and continues. But the problem is that the read_stamp does not get updated, because the page was already swapped. The solution to this was to not swap the page if the ring buffer happens to be empty. This also removes the side effect that the writes on the reader page will not get updated because the writer never gets back on the reader page without a swap. That is, if a read happens on an empty buffer, but then no reads happen for a while. If a swap took place, and the writer were to start writing a lot of data (function tracer), it will start overflowing the ring buffer and overwrite the older data. But because the writer never goes back onto the reader page, the data left on the reader page never gets overwritten. This causes the reader to see really old data, followed by a jump to newer data. Link: http://lkml.kernel.org/r/1340060577-9112-1-git-send-email-dhsharp@google.com Google-Bug-Id: 6410455 Reported-by: David Sharp <dhsharp@google.com> tested-by: David Sharp <dhsharp@google.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2012-06-29 01:35:04 +08:00
/* Don't bother swapping if the ring buffer is empty */
if (rb_num_of_entries(cpu_buffer) == 0)
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Reset the reader page to size zero.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->real_end = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
spin:
/*
* Splice the empty reader page into the list around the head.
*/
reader = rb_set_head_page(cpu_buffer);
if (!reader)
goto out;
cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
cpu_buffer->reader_page->list.prev = reader->list.prev;
/*
* cpu_buffer->pages just needs to point to the buffer, it
* has no specific buffer page to point to. Lets move it out
* of our way so we don't accidentally swap it.
*/
cpu_buffer->pages = reader->list.prev;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/* The reader page will be pointing to the new head */
rb_set_list_to_head(&cpu_buffer->reader_page->list);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/*
* We want to make sure we read the overruns after we set up our
* pointers to the next object. The writer side does a
* cmpxchg to cross pages which acts as the mb on the writer
* side. Note, the reader will constantly fail the swap
* while the writer is updating the pointers, so this
* guarantees that the overwrite recorded here is the one we
* want to compare with the last_overrun.
*/
smp_mb();
overwrite = local_read(&(cpu_buffer->overrun));
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* Here's the tricky part.
*
* We need to move the pointer past the header page.
* But we can only do that if a writer is not currently
* moving it. The page before the header page has the
* flag bit '1' set if it is pointing to the page we want.
* but if the writer is in the process of moving it
* than it will be '2' or already moved '0'.
*/
ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
* If we did not convert it, then we must try again.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
if (!ret)
goto spin;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
/*
* Yay! We succeeded in replacing the page.
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
*
* Now make the new head point back to the reader page.
*/
rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
rb_inc_page(&cpu_buffer->head_page);
local_inc(&cpu_buffer->pages_read);
/* Finally update the reader page to the new head */
cpu_buffer->reader_page = reader;
cpu_buffer->reader_page->read = 0;
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
if (overwrite != cpu_buffer->last_overrun) {
cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
cpu_buffer->last_overrun = overwrite;
}
goto again;
out:
/* Update the read_stamp on the first event */
if (reader && reader->read == 0)
cpu_buffer->read_stamp = reader->page->time_stamp;
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
ring-buffer: Fix race between reset page and reading page The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking. If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go passed the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded". The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding. Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues. To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it. To be paranoid, it will only spin for 1 second, otherwise it will warn and shutdown the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second! Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home Cc: Ingo Molnar <mingo@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li <jiazi.li@transsion.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-09-29 22:49:09 +08:00
/*
* The writer has preempt disable, wait for it. But not forever
* Although, 1 second is pretty much "forever"
*/
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
break;
udelay(1);
/* Get the latest version of the reader write value */
smp_rmb();
}
/* The writer is not moving forward? Something is wrong */
if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
reader = NULL;
/*
* Make sure we see any padding after the write update
ring-buffer: Fix race while reader and writer are on the same page When user reads file 'trace_pipe', kernel keeps printing following logs that warn at "cpu_buffer->reader_page->read > rb_page_size(reader)" in rb_get_reader_page(). It just looks like there's an infinite loop in tracing_read_pipe(). This problem occurs several times on arm64 platform when testing v5.10 and below. Call trace: rb_get_reader_page+0x248/0x1300 rb_buffer_peek+0x34/0x160 ring_buffer_peek+0xbc/0x224 peek_next_entry+0x98/0xbc __find_next_entry+0xc4/0x1c0 trace_find_next_entry_inc+0x30/0x94 tracing_read_pipe+0x198/0x304 vfs_read+0xb4/0x1e0 ksys_read+0x74/0x100 __arm64_sys_read+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180 Then I dump the vmcore and look into the problematic per_cpu ring_buffer, I found that tail_page/commit_page/reader_page are on the same page while reader_page->read is obviously abnormal: tail_page == commit_page == reader_page == { .write = 0x100d20, .read = 0x8f9f4805, // Far greater than 0xd20, obviously abnormal!!! .entries = 0x10004c, .real_end = 0x0, .page = { .time_stamp = 0x857257416af0, .commit = 0xd20, // This page hasn't been full filled. // .data[0...0xd20] seems normal. } } The root cause is most likely the race that reader and writer are on the same page while reader saw an event that not fully committed by writer. To fix this, add memory barriers to make sure the reader can see the content of what is committed. Since commit a0fcaaed0c46 ("ring-buffer: Fix race between reset page and reading page") has added the read barrier in rb_get_reader_page(), here we just need to add the write barrier. Link: https://lore.kernel.org/linux-trace-kernel/20230325021247.2923907-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: 77ae365eca89 ("ring-buffer: make lockless") Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-03-25 10:12:47 +08:00
* (see rb_reset_tail()).
*
* In addition, a writer may be writing on the reader page
* if the page has not been fully filled, so the read barrier
* is also needed to make sure we see the content of what is
* committed by the writer (see rb_set_commit_to_write()).
ring-buffer: Fix race between reset page and reading page The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking. If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go passed the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded". The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding. Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues. To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it. To be paranoid, it will only spin for 1 second, otherwise it will warn and shutdown the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second! Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home Cc: Ingo Molnar <mingo@kernel.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li <jiazi.li@transsion.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2022-09-29 22:49:09 +08:00
*/
smp_rmb();
return reader;
}
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
struct buffer_page *reader;
unsigned length;
reader = rb_get_reader_page(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* This function should not be called when buffer is empty */
if (RB_WARN_ON(cpu_buffer, !reader))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
event = rb_reader_event(cpu_buffer);
if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
cpu_buffer->read++;
rb_update_read_stamp(cpu_buffer, event);
length = rb_event_length(event);
cpu_buffer->reader_page->read += length;
cpu_buffer->read_bytes += length;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
static void rb_advance_iter(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
cpu_buffer = iter->cpu_buffer;
/* If head == next_event then we need to jump to the next event */
if (iter->head == iter->next_event) {
/* If the event gets overwritten again, there's nothing to do */
if (rb_iter_head_event(iter) == NULL)
return;
}
iter->head = iter->next_event;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Check if we are at the end of the buffer.
*/
if (iter->next_event >= rb_page_size(iter->head_page)) {
/* discarded commits can make the page empty */
if (iter->head_page == cpu_buffer->commit_page)
return;
rb_inc_iter(iter);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return;
}
rb_update_iter_read_stamp(iter, iter->event);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
{
return cpu_buffer->lost_events;
}
static struct ring_buffer_event *
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
unsigned long *lost_events)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_event *event;
struct buffer_page *reader;
int nr_loops = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (ts)
*ts = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
again:
/*
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
* We repeat when a time extend is encountered.
* Since the time extend is always attached to a data event,
* we should never loop more than once.
* (We never hit the following condition more than twice).
*/
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return NULL;
event = rb_reader_event(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
switch (event->type_len) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event))
RB_WARN_ON(cpu_buffer, 1);
/*
* Because the writer could be discarding every
* event it creates (which would probably be bad)
* if we were to go back to "again" then we may never
* catch up, and will trigger the warn on, or lock
* the box. Return the padding, and we will release
* the current locks, and try again.
*/
return event;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_TIME_EXTEND:
/* Internal data, OK to advance */
rb_advance_reader(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
goto again;
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
*ts = rb_fix_abs_ts(*ts, reader->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
/* Internal data, OK to advance */
rb_advance_reader(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
goto again;
case RINGBUF_TYPE_DATA:
if (ts && !(*ts)) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*ts = cpu_buffer->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
if (lost_events)
*lost_events = rb_lost_events(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return event;
default:
RB_WARN_ON(cpu_buffer, 1);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_peek);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static struct ring_buffer_event *
rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct trace_buffer *buffer;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
int nr_loops = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (ts)
*ts = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = iter->cpu_buffer;
buffer = cpu_buffer->buffer;
/*
* Check if someone performed a consuming read to the buffer
* or removed some pages from the buffer. In these cases,
* iterator was invalidated and we need to reset it.
*/
if (unlikely(iter->cache_read != cpu_buffer->read ||
iter->cache_reader_page != cpu_buffer->reader_page ||
iter->cache_pages_removed != cpu_buffer->pages_removed))
rb_iter_reset(iter);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
again:
if (ring_buffer_iter_empty(iter))
return NULL;
/*
* As the writer can mess with what the iterator is trying
* to read, just give up if we fail to get an event after
* three tries. The iterator is not as reliable when reading
* the ring buffer with an active write as the consumer is.
* Do not warn if the three failures is reached.
*/
if (++nr_loops > 3)
return NULL;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
if (iter->head >= rb_page_size(iter->head_page)) {
rb_inc_iter(iter);
goto again;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
event = rb_iter_head_event(iter);
if (!event)
goto again;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
switch (event->type_len) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_PADDING:
if (rb_null_event(event)) {
rb_inc_iter(iter);
goto again;
}
rb_advance_iter(iter);
return event;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
case RINGBUF_TYPE_TIME_EXTEND:
/* Internal data, OK to advance */
rb_advance_iter(iter);
goto again;
case RINGBUF_TYPE_TIME_STAMP:
if (ts) {
*ts = rb_event_time_stamp(event);
*ts = rb_fix_abs_ts(*ts, iter->head_page->page->time_stamp);
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
/* Internal data, OK to advance */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_advance_iter(iter);
goto again;
case RINGBUF_TYPE_DATA:
if (ts && !(*ts)) {
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*ts = iter->read_stamp + event->time_delta;
ring_buffer_normalize_time_stamp(buffer,
cpu_buffer->cpu, ts);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return event;
default:
RB_WARN_ON(cpu_buffer, 1);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return NULL;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static inline bool rb_reader_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
if (likely(!in_nmi())) {
raw_spin_lock(&cpu_buffer->reader_lock);
return true;
}
/*
* If an NMI die dumps out the content of the ring buffer
* trylock must be used to prevent a deadlock if the NMI
* preempted a task that holds the ring buffer locks. If
* we get the lock then all is fine, if not, then continue
* to do the read, but this can corrupt the ring buffer,
* so it must be permanently disabled from future writes.
* Reading from NMI is a oneshot deal.
*/
if (raw_spin_trylock(&cpu_buffer->reader_lock))
return true;
/* Continue without locking, but disable the ring buffer */
atomic_inc(&cpu_buffer->record_disabled);
return false;
}
static inline void
rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
{
if (likely(locked))
raw_spin_unlock(&cpu_buffer->reader_lock);
}
/**
* ring_buffer_peek - peek at the next event to be read
* @buffer: The ring buffer to read
* @cpu: The cpu to peak at
* @ts: The timestamp counter of this event.
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
* @lost_events: a variable to store if events were lost (may be NULL)
*
* This will return the event that will be read next, but does
* not consume the data.
*/
struct ring_buffer_event *
ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
unsigned long flags;
bool dolock;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
again:
local_irq_save(flags);
dolock = rb_reader_lock(cpu_buffer);
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [<ffffffff8106776f>] ? rb_advance_reader+0x2e/0xc5 [<ffffffff81039ffe>] warn_slowpath_common+0x77/0x8f [<ffffffff8103a025>] warn_slowpath_null+0xf/0x11 [<ffffffff8106776f>] rb_advance_reader+0x2e/0xc5 [<ffffffff81068bda>] ring_buffer_consume+0xa0/0xd2 [<ffffffff81326933>] op_cpu_buffer_read_entry+0x21/0x9e [<ffffffff810be3af>] ? __find_get_block+0x4b/0x165 [<ffffffff8132749b>] sync_buffer+0xa5/0x401 [<ffffffff810be3af>] ? __find_get_block+0x4b/0x165 [<ffffffff81326c1b>] ? wq_sync_buffer+0x0/0x78 [<ffffffff81326c76>] wq_sync_buffer+0x5b/0x78 [<ffffffff8104aa30>] worker_thread+0x113/0x1ac [<ffffffff8104dd95>] ? autoremove_wake_function+0x0/0x38 [<ffffffff8104a91d>] ? worker_thread+0x0/0x1ac [<ffffffff8104dc9a>] kthread+0x88/0x92 [<ffffffff8100bdba>] child_rip+0xa/0x20 [<ffffffff8104dc12>] ? kthread+0x0/0x92 [<ffffffff8100bdb0>] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt <rostedt@goodmis.org> Cc: <stable@kernel.org> Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-07-31 01:19:18 +08:00
if (event && event->type_len == RINGBUF_TYPE_PADDING)
rb_advance_reader(cpu_buffer);
rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
return event;
}
/** ring_buffer_iter_dropped - report if there are dropped events
* @iter: The ring buffer iterator
*
* Returns true if there was dropped events since the last peek.
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
bool ret = iter->missed_events != 0;
iter->missed_events = 0;
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
/**
* ring_buffer_iter_peek - peek at the next event to be read
* @iter: The ring buffer iterator
* @ts: The timestamp counter of this event.
*
* This will return the event that will be read next, but does
* not increment the iterator.
*/
struct ring_buffer_event *
ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
struct ring_buffer_event *event;
unsigned long flags;
again:
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
event = rb_iter_peek(iter, ts);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
return event;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_consume - return an event and consume it
* @buffer: The ring buffer to get the next event from
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
* @cpu: the cpu to read the buffer from
* @ts: a variable to store the timestamp (may be NULL)
* @lost_events: a variable to store if events were lost (may be NULL)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
* Returns the next event in the ring buffer, and that event is consumed.
* Meaning, that sequential reads will keep returning a different event,
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
unsigned long *lost_events)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
unsigned long flags;
bool dolock;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
again:
/* might be called in atomic */
preempt_disable();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
dolock = rb_reader_lock(cpu_buffer);
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event) {
cpu_buffer->lost_events = 0;
ring-buffer: Fix advance of reader in rb_buffer_peek() When calling rb_buffer_peek() from ring_buffer_consume() and a padding event is returned, the function rb_advance_reader() is called twice. This may lead to missing samples or under high workloads to the warning below. This patch fixes this. If a padding event is returned by rb_buffer_peek() it will be consumed by the calling function now. Also, I simplified some code in ring_buffer_consume(). ------------[ cut here ]------------ WARNING: at /dev/shm/.source/linux/kernel/trace/ring_buffer.c:2289 rb_advance_reader+0x2e/0xc5() Hardware name: Anaheim Modules linked in: Pid: 29, comm: events/2 Tainted: G W 2.6.31-rc3-oprofile-x86_64-standard-00059-g5050dc2 #1 Call Trace: [<ffffffff8106776f>] ? rb_advance_reader+0x2e/0xc5 [<ffffffff81039ffe>] warn_slowpath_common+0x77/0x8f [<ffffffff8103a025>] warn_slowpath_null+0xf/0x11 [<ffffffff8106776f>] rb_advance_reader+0x2e/0xc5 [<ffffffff81068bda>] ring_buffer_consume+0xa0/0xd2 [<ffffffff81326933>] op_cpu_buffer_read_entry+0x21/0x9e [<ffffffff810be3af>] ? __find_get_block+0x4b/0x165 [<ffffffff8132749b>] sync_buffer+0xa5/0x401 [<ffffffff810be3af>] ? __find_get_block+0x4b/0x165 [<ffffffff81326c1b>] ? wq_sync_buffer+0x0/0x78 [<ffffffff81326c76>] wq_sync_buffer+0x5b/0x78 [<ffffffff8104aa30>] worker_thread+0x113/0x1ac [<ffffffff8104dd95>] ? autoremove_wake_function+0x0/0x38 [<ffffffff8104a91d>] ? worker_thread+0x0/0x1ac [<ffffffff8104dc9a>] kthread+0x88/0x92 [<ffffffff8100bdba>] child_rip+0xa/0x20 [<ffffffff8104dc12>] ? kthread+0x0/0x92 [<ffffffff8100bdb0>] ? child_rip+0x0/0x20 ---[ end trace f561c0a58fcc89bd ]--- Cc: Steven Rostedt <rostedt@goodmis.org> Cc: <stable@kernel.org> Signed-off-by: Robert Richter <robert.richter@amd.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-07-31 01:19:18 +08:00
rb_advance_reader(cpu_buffer);
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
out:
preempt_enable();
if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
return event;
}
EXPORT_SYMBOL_GPL(ring_buffer_consume);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
* ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
tracing: kdb: Fix ftdump to not sleep As reported back in 2016-11 [1], the "ftdump" kdb command triggers a BUG for "sleeping function called from invalid context". kdb's "ftdump" command wants to call ring_buffer_read_prepare() in atomic context. A very simple solution for this is to add allocation flags to ring_buffer_read_prepare() so kdb can call it without triggering the allocation error. This patch does that. Note that in the original email thread about this, it was suggested that perhaps the solution for kdb was to either preallocate the buffer ahead of time or create our own iterator. I'm hoping that this alternative of adding allocation flags to ring_buffer_read_prepare() can be considered since it means I don't need to duplicate more of the core trace code into "trace_kdb.c" (for either creating my own iterator or re-preparing a ring allocator whose memory was already allocated). NOTE: another option for kdb is to actually figure out how to make it reuse the existing ftrace_dump() function and totally eliminate the duplication. This sounds very appealing and actually works (the "sr z" command can be seen to properly dump the ftrace buffer). The downside here is that ftrace_dump() fully consumes the trace buffer. Unless that is changed I'd rather not use it because it means "ftdump | grep xyz" won't be very useful to search the ftrace buffer since it will throw away the whole trace on the first grep. A future patch to dump only the last few lines of the buffer will also be hard to implement. [1] https://lkml.kernel.org/r/20161117191605.GA21459@google.com Link: http://lkml.kernel.org/r/20190308193205.213659-1-dianders@chromium.org Reported-by: Brian Norris <briannorris@chromium.org> Signed-off-by: Douglas Anderson <dianders@chromium.org> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2019-03-09 03:32:04 +08:00
* @flags: gfp flags to use for memory allocation
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
* This performs the initial preparations necessary to iterate
* through the buffer. Memory is allocated, buffer recording
* is disabled, and the iterator pointer is returned to the caller.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
* Disabling buffer recording prevents the reading from being
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
* corrupted. This is not a consuming read, so a producer is not
* expected.
*
* After a sequence of ring_buffer_read_prepare calls, the user is
* expected to make at least one call to ring_buffer_read_prepare_sync.
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
* Afterwards, ring_buffer_read_start is invoked to get things going
* for real.
*
* This overall must be paired with ring_buffer_read_finish.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
struct ring_buffer_iter *
ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
iter = kzalloc(sizeof(*iter), flags);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!iter)
return NULL;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* Holds the entire event: data and meta data */
iter->event = kmalloc(BUF_PAGE_SIZE, flags);
if (!iter->event) {
kfree(iter);
return NULL;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
iter->cpu_buffer = cpu_buffer;
atomic_inc(&cpu_buffer->resize_disabled);
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
/**
* ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
*
* All previously invoked ring_buffer_read_prepare calls to prepare
* iterators will be synchronized. Afterwards, read_buffer_read_start
* calls on those iterators are allowed.
*/
void
ring_buffer_read_prepare_sync(void)
{
synchronize_rcu();
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
/**
* ring_buffer_read_start - start a non consuming read of the buffer
* @iter: The iterator returned by ring_buffer_read_prepare
*
* This finalizes the startup of an iteration through the buffer.
* The iterator comes from a call to ring_buffer_read_prepare and
* an intervening ring_buffer_read_prepare_sync must have been
* performed.
*
* Must be paired with ring_buffer_read_finish.
ring-buffer: Make non-consuming read less expensive with lots of cpus. When performing a non-consuming read, a synchronize_sched() is performed once for every cpu which is actively tracing. This is very expensive, and can make it take several seconds to open up the 'trace' file with lots of cpus. Only one synchronize_sched() call is actually necessary. What is desired is for all cpus to see the disabling state change. So we transform the existing sequence: for_each_cpu() { ring_buffer_read_start(); } where each ring_buffer_start() call performs a synchronize_sched(), into the following: for_each_cpu() { ring_buffer_read_prepare(); } ring_buffer_read_prepare_sync(); for_each_cpu() { ring_buffer_read_start(); } wherein only the single ring_buffer_read_prepare_sync() call needs to do the synchronize_sched(). The first phase, via ring_buffer_read_prepare(), allocates the 'iter' memory and increments ->record_disabled. In the second phase, ring_buffer_read_prepare_sync() makes sure this ->record_disabled state is visible fully to all cpus. And in the final third phase, the ring_buffer_read_start() calls reset the 'iter' objects allocated in the first phase since we now know that none of the cpus are adding trace entries any more. This makes openning the 'trace' file nearly instantaneous on a sparc64 Niagara2 box with 128 cpus tracing. Signed-off-by: David S. Miller <davem@davemloft.net> LKML-Reference: <20100420.154711.11246950.davem@davemloft.net> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-21 06:47:11 +08:00
*/
void
ring_buffer_read_start(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
if (!iter)
return;
cpu_buffer = iter->cpu_buffer;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_read_finish - finish reading the iterator of the buffer
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @iter: The iterator retrieved by ring_buffer_start
*
* This re-enables the recording to the buffer, and frees the
* iterator.
*/
void
ring_buffer_read_finish(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* Ring buffer is disabled from recording, here's a good place
* to check the integrity of the ring buffer.
* Must prevent readers from trying to read, as the check
* clears the HEAD page and readers require it.
*/
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_check_pages(cpu_buffer);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->resize_disabled);
kfree(iter->event);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
kfree(iter);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_iter_advance - advance the iterator to the next location
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @iter: The ring buffer iterator
*
* Move the location of the iterator such that the next read will
* be the next location of the iterator.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_size - return the size of the ring buffer (in bytes)
* @buffer: The ring buffer.
* @cpu: The CPU to get ring buffer size from.
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
/*
* Earlier, this method returned
* BUF_PAGE_SIZE * buffer->nr_pages
* Since the nr_pages field is now removed, we have converted this to
* return the per cpu buffer value.
*/
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found it's because in tracing_read_pipe(), ring_buffer_empty_cpu() found some buffer is not empty but then it cannot read anything due to "rb_num_of_entries() == 0" always true, Then it infinitely loop the procedure due to user buffer not been filled, see following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer not been filled so goto 'waitgain' // and eventually leads to an deadloop in kernel!!! } By some analyzing, I found that when resetting ringbuffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when reducing the ringbuffer, and if some reduced pages exist dirty 'entries' data, they will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which cause wrong 'overrun' count and eventually cause the deadloop issue. To fix it, we need to clear every pages in rb_reset_cpu(). Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-09 06:51:44 +08:00
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
local_set(&page->entries, 0);
rb_init_page(page->page);
page->read = 0;
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
static void
rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found it's because in tracing_read_pipe(), ring_buffer_empty_cpu() found some buffer is not empty but then it cannot read anything due to "rb_num_of_entries() == 0" always true, Then it infinitely loop the procedure due to user buffer not been filled, see following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer not been filled so goto 'waitgain' // and eventually leads to an deadloop in kernel!!! } By some analyzing, I found that when resetting ringbuffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when reducing the ringbuffer, and if some reduced pages exist dirty 'entries' data, they will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which cause wrong 'overrun' count and eventually cause the deadloop issue. To fix it, we need to clear every pages in rb_reset_cpu(). Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-09 06:51:44 +08:00
struct buffer_page *page;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_head_page_deactivate(cpu_buffer);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer->head_page
= list_entry(cpu_buffer->pages, struct buffer_page, list);
ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found it's because in tracing_read_pipe(), ring_buffer_empty_cpu() found some buffer is not empty but then it cannot read anything due to "rb_num_of_entries() == 0" always true, Then it infinitely loop the procedure due to user buffer not been filled, see following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer not been filled so goto 'waitgain' // and eventually leads to an deadloop in kernel!!! } By some analyzing, I found that when resetting ringbuffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when reducing the ringbuffer, and if some reduced pages exist dirty 'entries' data, they will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which cause wrong 'overrun' count and eventually cause the deadloop issue. To fix it, we need to clear every pages in rb_reset_cpu(). Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-09 06:51:44 +08:00
rb_clear_buffer_page(cpu_buffer->head_page);
list_for_each_entry(page, cpu_buffer->pages, list) {
rb_clear_buffer_page(page);
}
cpu_buffer->tail_page = cpu_buffer->head_page;
cpu_buffer->commit_page = cpu_buffer->head_page;
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
INIT_LIST_HEAD(&cpu_buffer->new_pages);
ring-buffer: Fix deadloop issue on reading trace_pipe Soft lockup occurs when reading file 'trace_pipe': watchdog: BUG: soft lockup - CPU#6 stuck for 22s! [cat:4488] [...] RIP: 0010:ring_buffer_empty_cpu+0xed/0x170 RSP: 0018:ffff88810dd6fc48 EFLAGS: 00000246 RAX: 0000000000000000 RBX: 0000000000000246 RCX: ffffffff93d1aaeb RDX: ffff88810a280040 RSI: 0000000000000008 RDI: ffff88811164b218 RBP: ffff88811164b218 R08: 0000000000000000 R09: ffff88815156600f R10: ffffed102a2acc01 R11: 0000000000000001 R12: 0000000051651901 R13: 0000000000000000 R14: ffff888115e49500 R15: 0000000000000000 [...] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8d853c2000 CR3: 000000010dcd8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __find_next_entry+0x1a8/0x4b0 ? peek_next_entry+0x250/0x250 ? down_write+0xa5/0x120 ? down_write_killable+0x130/0x130 trace_find_next_entry_inc+0x3b/0x1d0 tracing_read_pipe+0x423/0xae0 ? tracing_splice_read_pipe+0xcb0/0xcb0 vfs_read+0x16b/0x490 ksys_read+0x105/0x210 ? __ia32_sys_pwrite64+0x200/0x200 ? switch_fpu_return+0x108/0x220 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x61/0xc6 Through the vmcore, I found it's because in tracing_read_pipe(), ring_buffer_empty_cpu() found some buffer is not empty but then it cannot read anything due to "rb_num_of_entries() == 0" always true, Then it infinitely loop the procedure due to user buffer not been filled, see following code path: tracing_read_pipe() { ... ... waitagain: tracing_wait_pipe() // 1. find non-empty buffer here trace_find_next_entry_inc() // 2. loop here try to find an entry __find_next_entry() ring_buffer_empty_cpu(); // 3. find non-empty buffer peek_next_entry() // 4. but peek always return NULL ring_buffer_peek() rb_buffer_peek() rb_get_reader_page() // 5. because rb_num_of_entries() == 0 always true here // then return NULL // 6. user buffer not been filled so goto 'waitgain' // and eventually leads to an deadloop in kernel!!! } By some analyzing, I found that when resetting ringbuffer, the 'entries' of its pages are not all cleared (see rb_reset_cpu()). Then when reducing the ringbuffer, and if some reduced pages exist dirty 'entries' data, they will be added into 'cpu_buffer->overrun' (see rb_remove_pages()), which cause wrong 'overrun' count and eventually cause the deadloop issue. To fix it, we need to clear every pages in rb_reset_cpu(). Link: https://lore.kernel.org/linux-trace-kernel/20230708225144.3785600-1-zhengyejian1@huawei.com Cc: stable@vger.kernel.org Fixes: a5fb833172eca ("ring-buffer: Fix uninitialized read_stamp") Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-09 06:51:44 +08:00
rb_clear_buffer_page(cpu_buffer->reader_page);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
local_set(&cpu_buffer->entries_bytes, 0);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
local_set(&cpu_buffer->overrun, 0);
local_set(&cpu_buffer->commit_overrun, 0);
local_set(&cpu_buffer->dropped_events, 0);
local_set(&cpu_buffer->entries, 0);
ring-buffer: use commit counters for commit pointer accounting The ring buffer is made up of three sets of pointers. The head page pointer, which points to the next page for the reader to get. The commit pointer and commit index, which points to the page and index of the last committed write respectively. The tail pointer and tail index, which points to the page and the index of the last reserved data respectively (non committed). The commit pointer is only moved forward by the outer most writer. If a nested writer comes in, it will not move the pointer forward. The current implementation has a flaw. It assumes that the outer most writer successfully reserved data. There's a small race window where the outer most writer could find the tail pointer, but a nested writer could come in (via interrupt) and move the tail forward, and even the commit forward. The outer writer would not realized the commit moved forward and the accounting will break. This patch changes the design to use counters in the per cpu buffers to keep track of commits. The counters are incremented at the start of the commit, and decremented at the end. If the end commit counter is 1, then it moves the commit pointers. A loop is made to check for races between checking and moving the commit pointers. Only the outer commit should move the pointers anyway. The test of knowing if a reserve is equal to the last commit update is still needed to know for time keeping. The time code is much less racey than the commit updates. This change not only solves the mentioned race, but also makes the code simpler. [ Impact: fix commit race and simplify code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-06-17 00:37:57 +08:00
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
local_set(&cpu_buffer->pages_touched, 0);
local_set(&cpu_buffer->pages_lost, 0);
local_set(&cpu_buffer->pages_read, 0);
cpu_buffer->last_pages_touch = 0;
cpu_buffer->shortest_full = 0;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
cpu_buffer->read = 0;
cpu_buffer->read_bytes = 0;
ring-buffer: Add rb_time_t 64 bit operations for speeding up 32 bit After a discussion with the new time algorithm to have nested events still have proper time keeping but required using local64_t atomic operations. Mathieu was concerned about the performance this would have on 32 bit machines, as in most cases, atomic 64 bit operations on them can be expensive. As the ring buffer's timing needs do not require full features of local64_t, a wrapper is made to implement a new rb_time_t operation that uses two longs on 32 bit machines but still uses the local64_t operations on 64 bit machines. There's a switch that can be made in the file to force 64 bit to use the 32 bit version just for testing purposes. All reads do not need to succeed if a read happened while the stamp being read is in the process of being updated. The requirement is that all reads must succed that were done by an interrupting event (where this event was interrupted by another event that did the write). Or if the event itself did the write first. That is: rb_time_set(t, x) followed by rb_time_read(t) will always succeed (even if it gets interrupted by another event that writes to t. The result of the read will be either the previous set, or a set performed by an interrupting event. If the read is done by an event that interrupted another event that was in the process of setting the time stamp, and no other event came along to write to that time stamp, it will fail and the rb_time_read() will return that it failed (the value to read will be undefined). A set will always write to the time stamp and return with a valid time stamp, such that any read after it will be valid. A cmpxchg may fail if it interrupted an event that was in the process of updating the time stamp just like the reads do. Other than that, it will act like a normal cmpxchg. The way this works is that the rb_time_t is made of of three fields. A cnt, that gets updated atomically everyting a modification is made. A top that represents the most significant 30 bits of the time, and a bottom to represent the least significant 30 bits of the time. Notice, that the time values is only 60 bits long (where the ring buffer only uses 59 bits, which gives us 18 years of nanoseconds!). The top two bits of both the top and bottom is a 2 bit counter that gets set by the value of the least two significant bits of the cnt. A read of the top and the bottom where both the top and bottom have the same most significant top 2 bits, are considered a match and a valid 60 bit number can be created from it. If they do not match, then the number is considered invalid, and this must only happen if an event interrupted another event in the midst of updating the time stamp. This is only used for 32 bits machines as 64 bit machines can get better performance out of the local64_t. This has been tested heavily by forcing 64 bit to use this logic. Link: https://lore.kernel.org/r/20200625225345.18cf5881@oasis.local.home Link: http://lkml.kernel.org/r/20200629025259.309232719@goodmis.org Inspired-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2020-06-29 10:52:27 +08:00
rb_time_set(&cpu_buffer->write_stamp, 0);
rb_time_set(&cpu_buffer->before_stamp, 0);
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp));
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
cpu_buffer->lost_events = 0;
cpu_buffer->last_overrun = 0;
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
rb_head_page_activate(cpu_buffer);
cpu_buffer->pages_removed = 0;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
goto out;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
out:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
* @cpu: The CPU buffer to be reset
*/
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
/* Make sure all commits have finished */
synchronize_rcu();
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/* Flag to ensure proper resetting of atomic variables */
#define RESET_BIT (1 << 30)
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
*/
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
for_each_online_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
}
/* Make sure all commits have finished */
synchronize_rcu();
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
/*
* If a CPU came online during the synchronize_rcu(), then
* ignore it.
*/
if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
continue;
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
mutex_unlock(&buffer->mutex);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
/**
* ring_buffer_reset - reset a ring buffer
* @buffer: The ring buffer to reset all cpu buffers
*/
void ring_buffer_reset(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
int cpu;
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
atomic_inc(&cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
}
/* Make sure all commits have finished */
synchronize_rcu();
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
}
mutex_unlock(&buffer->mutex);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_empty - is the ring buffer empty?
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* @buffer: The ring buffer to test
*/
bool ring_buffer_empty(struct trace_buffer *buffer)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
bool ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
int cpu;
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
if (!ret)
return false;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
return true;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_empty);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
* @buffer: The ring buffer
* @cpu: The CPU buffer to test
*/
bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
bool ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return true;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
dolock = rb_reader_lock(cpu_buffer);
ret = rb_per_cpu_empty(cpu_buffer);
rb_reader_unlock(cpu_buffer, dolock);
local_irq_restore(flags);
return ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
* @buffer_a: One buffer to swap with
* @buffer_b: The other buffer to swap with
* @cpu: the CPU of the buffers to swap
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
*
* This function is useful for tracers that want to take a "snapshot"
* of a CPU buffer and has another back up buffer lying around.
* it is expected that the tracer handles the cpu buffer not being
* used at the moment.
*/
int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
struct trace_buffer *buffer_b, int cpu)
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
{
struct ring_buffer_per_cpu *cpu_buffer_a;
struct ring_buffer_per_cpu *cpu_buffer_b;
int ret = -EINVAL;
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
ret = -EAGAIN;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
if (atomic_read(&buffer_a->record_disabled))
goto out;
if (atomic_read(&buffer_b->record_disabled))
goto out;
if (atomic_read(&cpu_buffer_a->record_disabled))
goto out;
if (atomic_read(&cpu_buffer_b->record_disabled))
goto out;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/*
* We can't do a synchronize_rcu here because this
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
* function can be called in atomic context.
* Normally this will be called from the same CPU as cpu.
* If not it's up to the caller to protect this.
*/
atomic_inc(&cpu_buffer_a->record_disabled);
atomic_inc(&cpu_buffer_b->record_disabled);
ret = -EBUSY;
if (local_read(&cpu_buffer_a->committing))
goto out_dec;
if (local_read(&cpu_buffer_b->committing))
goto out_dec;
ring-buffer: Do not swap cpu_buffer during resize process When ring_buffer_swap_cpu was called during resize process, the cpu buffer was swapped in the middle, resulting in incorrect state. Continuing to run in the wrong state will result in oops. This issue can be easily reproduced using the following two scripts: /tmp # cat test1.sh //#! /bin/sh for i in `seq 0 100000` do echo 2000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 echo 5000 > /sys/kernel/debug/tracing/buffer_size_kb sleep 0.5 done /tmp # cat test2.sh //#! /bin/sh for i in `seq 0 100000` do echo irqsoff > /sys/kernel/debug/tracing/current_tracer sleep 1 echo nop > /sys/kernel/debug/tracing/current_tracer sleep 1 done /tmp # ./test1.sh & /tmp # ./test2.sh & A typical oops log is as follows, sometimes with other different oops logs. [ 231.711293] WARNING: CPU: 0 PID: 9 at kernel/trace/ring_buffer.c:2026 rb_update_pages+0x378/0x3f8 [ 231.713375] Modules linked in: [ 231.714735] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 231.716750] Hardware name: linux,dummy-virt (DT) [ 231.718152] Workqueue: events update_pages_handler [ 231.719714] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 231.721171] pc : rb_update_pages+0x378/0x3f8 [ 231.722212] lr : rb_update_pages+0x25c/0x3f8 [ 231.723248] sp : ffff800082b9bd50 [ 231.724169] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 231.726102] x26: 0000000000000001 x25: fffffffffffff010 x24: 0000000000000ff0 [ 231.728122] x23: ffff0000c3a0b600 x22: ffff0000c3a0b5c0 x21: fffffffffffffe0a [ 231.730203] x20: ffff0000c3a0b600 x19: ffff0000c0102400 x18: 0000000000000000 [ 231.732329] x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffffe7aa8510 [ 231.734212] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000002 [ 231.736291] x11: ffff8000826998a8 x10: ffff800082b9baf0 x9 : ffff800081137558 [ 231.738195] x8 : fffffc00030e82c8 x7 : 0000000000000000 x6 : 0000000000000001 [ 231.740192] x5 : ffff0000ffbafe00 x4 : 0000000000000000 x3 : 0000000000000000 [ 231.742118] x2 : 00000000000006aa x1 : 0000000000000001 x0 : ffff0000c0007208 [ 231.744196] Call trace: [ 231.744892] rb_update_pages+0x378/0x3f8 [ 231.745893] update_pages_handler+0x1c/0x38 [ 231.746893] process_one_work+0x1f0/0x468 [ 231.747852] worker_thread+0x54/0x410 [ 231.748737] kthread+0x124/0x138 [ 231.749549] ret_from_fork+0x10/0x20 [ 231.750434] ---[ end trace 0000000000000000 ]--- [ 233.720486] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 233.721696] Mem abort info: [ 233.721935] ESR = 0x0000000096000004 [ 233.722283] EC = 0x25: DABT (current EL), IL = 32 bits [ 233.722596] SET = 0, FnV = 0 [ 233.722805] EA = 0, S1PTW = 0 [ 233.723026] FSC = 0x04: level 0 translation fault [ 233.723458] Data abort info: [ 233.723734] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 233.724176] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 233.724589] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 233.725075] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104943000 [ 233.725592] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 [ 233.726231] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 233.726720] Modules linked in: [ 233.727007] CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: G W 6.5.0-rc1-00276-g20edcec23f92 #15 [ 233.727777] Hardware name: linux,dummy-virt (DT) [ 233.728225] Workqueue: events update_pages_handler [ 233.728655] pstate: 200000c5 (nzCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 233.729054] pc : rb_update_pages+0x1a8/0x3f8 [ 233.729334] lr : rb_update_pages+0x154/0x3f8 [ 233.729592] sp : ffff800082b9bd50 [ 233.729792] x29: ffff800082b9bd50 x28: ffff8000825f7000 x27: 0000000000000000 [ 233.730220] x26: 0000000000000000 x25: ffff800082a8b840 x24: ffff0000c0102418 [ 233.730653] x23: 0000000000000000 x22: fffffc000304c880 x21: 0000000000000003 [ 233.731105] x20: 00000000000001f4 x19: ffff0000c0102400 x18: ffff800082fcbc58 [ 233.731727] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000001 [ 233.732282] x14: ffff8000825fe0c8 x13: 0000000000000001 x12: 0000000000000000 [ 233.732709] x11: ffff8000826998a8 x10: 0000000000000ae0 x9 : ffff8000801b760c [ 233.733148] x8 : fefefefefefefeff x7 : 0000000000000018 x6 : ffff0000c03298c0 [ 233.733553] x5 : 0000000000000002 x4 : 0000000000000000 x3 : 0000000000000000 [ 233.733972] x2 : ffff0000c3a0b600 x1 : 0000000000000000 x0 : 0000000000000000 [ 233.734418] Call trace: [ 233.734593] rb_update_pages+0x1a8/0x3f8 [ 233.734853] update_pages_handler+0x1c/0x38 [ 233.735148] process_one_work+0x1f0/0x468 [ 233.735525] worker_thread+0x54/0x410 [ 233.735852] kthread+0x124/0x138 [ 233.736064] ret_from_fork+0x10/0x20 [ 233.736387] Code: 92400000 910006b5 aa000021 aa0303f7 (f9400060) [ 233.736959] ---[ end trace 0000000000000000 ]--- After analysis, the seq of the error is as follows [1-5]: int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, int cpu_id) { for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //1. get cpu_buffer, aka cpu_buffer(A) ... ... schedule_work_on(cpu, &cpu_buffer->update_pages_work); //2. 'update_pages_work' is queue on 'cpu', cpu_buffer(A) is passed to // update_pages_handler, do the update process, set 'update_done' in // complete(&cpu_buffer->update_done) and to wakeup resize process. //----> //3. Just at this moment, ring_buffer_swap_cpu is triggered, //cpu_buffer(A) be swaped to cpu_buffer(B), the max_buffer. //ring_buffer_swap_cpu is called as the 'Call trace' below. Call trace: dump_backtrace+0x0/0x2f8 show_stack+0x18/0x28 dump_stack+0x12c/0x188 ring_buffer_swap_cpu+0x2f8/0x328 update_max_tr_single+0x180/0x210 check_critical_timing+0x2b4/0x2c8 tracer_hardirqs_on+0x1c0/0x200 trace_hardirqs_on+0xec/0x378 el0_svc_common+0x64/0x260 do_el0_svc+0x90/0xf8 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb8 el0_sync+0x180/0x1c0 //<---- /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; //4. get cpu_buffer, cpu_buffer(B) is used in the following process, //the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong. //for example, cpu_buffer(A)->update_done will leave be set 1, and will //not 'wait_for_completion' at the next resize round. if (!cpu_buffer->nr_pages_to_update) continue; if (cpu_online(cpu)) wait_for_completion(&cpu_buffer->update_done); cpu_buffer->nr_pages_to_update = 0; } ... } //5. the state of cpu_buffer(A) and cpu_buffer(B) is totally wrong, //Continuing to run in the wrong state, then oops occurs. Link: https://lore.kernel.org/linux-trace-kernel/202307191558478409990@zte.com.cn Signed-off-by: Chen Lin <chen.lin5@zte.com.cn> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-19 15:58:47 +08:00
/*
* When resize is in progress, we cannot swap it because
* it will mess the state of the cpu buffer.
*/
if (atomic_read(&buffer_a->resizing))
goto out_dec;
if (atomic_read(&buffer_b->resizing))
goto out_dec;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
cpu_buffer_b->buffer = buffer_a;
cpu_buffer_a->buffer = buffer_b;
ret = 0;
out_dec:
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
out:
return ret;
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
tracing: unified trace buffer This is a unified tracing buffer that implements a ring buffer that hopefully everyone will eventually be able to use. The events recorded into the buffer have the following structure: struct ring_buffer_event { u32 type:2, len:3, time_delta:27; u32 array[]; }; The minimum size of an event is 8 bytes. All events are 4 byte aligned inside the buffer. There are 4 types (all internal use for the ring buffer, only the data type is exported to the interface users). RINGBUF_TYPE_PADDING: this type is used to note extra space at the end of a buffer page. RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events is greater than the 27 bit delta can hold. We add another 32 bits, and record that in its own event (8 byte size). RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to help keep the buffer timestamps in sync. RINGBUF_TYPE_DATA: The event actually holds user data. The "len" field is only three bits. Since the data must be 4 byte aligned, this field is shifted left by 2, giving a max length of 28 bytes. If the data load is greater than 28 bytes, the first array field holds the full length of the data load and the len field is set to zero. Example, data size of 7 bytes: type = RINGBUF_TYPE_DATA len = 2 time_delta: <time-stamp> - <prev_event-time-stamp> array[0..1]: <7 bytes of data> <1 byte empty> This event is saved in 12 bytes of the buffer. An event with 82 bytes of data: type = RINGBUF_TYPE_DATA len = 0 time_delta: <time-stamp> - <prev_event-time-stamp> array[0]: 84 (Note the alignment) array[1..14]: <82 bytes of data> <2 bytes empty> The above event is saved in 92 bytes (if my math is correct). 82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length. Do not reference the above event struct directly. Use the following functions to gain access to the event table, since the ring_buffer_event structure may change in the future. ring_buffer_event_length(event): get the length of the event. This is the size of the memory used to record this event, and not the size of the data pay load. ring_buffer_time_delta(event): get the time delta of the event This returns the delta time stamp since the last event. Note: Even though this is in the header, there should be no reason to access this directly, accept for debugging. ring_buffer_event_data(event): get the data from the event This is the function to use to get the actual data from the event. Note, it is only a pointer to the data inside the buffer. This data must be copied to another location otherwise you risk it being written over in the buffer. ring_buffer_lock: A way to lock the entire buffer. ring_buffer_unlock: unlock the buffer. ring_buffer_alloc: create a new ring buffer. Can choose between overwrite or consumer/producer mode. Overwrite will overwrite old data, where as consumer producer will throw away new data if the consumer catches up with the producer. The consumer/producer is the default. ring_buffer_free: free the ring buffer. ring_buffer_resize: resize the buffer. Changes the size of each cpu buffer. Note, it is up to the caller to provide that the buffer is not being used while this is happening. This requirement may go away but do not count on it. ring_buffer_lock_reserve: locks the ring buffer and allocates an entry on the buffer to write to. ring_buffer_unlock_commit: unlocks the ring buffer and commits it to the buffer. ring_buffer_write: writes some data into the ring buffer. ring_buffer_peek: Look at a next item in the cpu buffer. ring_buffer_consume: get the next item in the cpu buffer and consume it. That is, this function increments the head pointer. ring_buffer_read_start: Start an iterator of a cpu buffer. For now, this disables the cpu buffer, until you issue a finish. This is just because we do not want the iterator to be overwritten. This restriction may change in the future. But note, this is used for static reading of a buffer which is usually done "after" a trace. Live readings would want to use the ring_buffer_consume above, which will not disable the ring buffer. ring_buffer_read_finish: Finishes the read iterator and reenables the ring buffer. ring_buffer_iter_peek: Look at the next item in the cpu iterator. ring_buffer_read: Read the iterator and increment it. ring_buffer_iter_reset: Reset the iterator to point to the beginning of the cpu buffer. ring_buffer_iter_empty: Returns true if the iterator is at the end of the cpu buffer. ring_buffer_size: returns the size in bytes of each cpu buffer. Note, the real size is this times the number of CPUs. ring_buffer_reset_cpu: Sets the cpu buffer to empty ring_buffer_reset: sets all cpu buffers to empty ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a cpu buffer of another buffer. This is handy when you want to take a snap shot of a running trace on just one cpu. Having a backup buffer, to swap with facilitates this. Ftrace max latencies use this. ring_buffer_empty: Returns true if the ring buffer is empty. ring_buffer_empty_cpu: Returns true if the cpu buffer is empty. ring_buffer_record_disable: disable all cpu buffers (read only) ring_buffer_record_disable_cpu: disable a single cpu buffer (read only) ring_buffer_record_enable: enable all cpu buffers. ring_buffer_record_enabl_cpu: enable a single cpu buffer. ring_buffer_entries: The number of entries in a ring buffer. ring_buffer_overruns: The number of entries removed due to writing wrap. ring_buffer_time_stamp: Get the time stamp used by the ring buffer ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp into nanosecs. I still need to implement the GTOD feature. But we need support from the cpu frequency infrastructure. But this can be done at a later time without affecting the ring buffer interface. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-30 11:02:38 +08:00
/**
* ring_buffer_alloc_read_page - allocate a page to read from buffer
* @buffer: the buffer to allocate for.
* @cpu: the cpu buffer to allocate.
*
* This function is used in conjunction with ring_buffer_read_page.
* When reading a full page from the ring buffer, these functions
* can be used to speed up the process. The calling function should
* allocate a few pages first with this function. Then when it
* needs to get pages from the ring buffer, it passes the result
* of this function into ring_buffer_read_page, which will swap
* the page that was allocated, with the read page of the buffer.
*
* Returns:
ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu <chuhu@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-03 02:20:54 +08:00
* The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu <chuhu@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-03 02:20:54 +08:00
struct ring_buffer_per_cpu *cpu_buffer;
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
struct buffer_data_page *bpage = NULL;
unsigned long flags;
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
struct page *page;
ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu <chuhu@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-03 02:20:54 +08:00
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
cpu_buffer = buffer->buffers[cpu];
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
bpage = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
if (bpage)
goto out;
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
if (!page)
ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu <chuhu@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-03 02:20:54 +08:00
return ERR_PTR(-ENOMEM);
tracing: Use NUMA allocation for per-cpu ring buffer pages The tracing ring buffer is a group of per-cpu ring buffers where allocation and logging is done on a per-cpu basis. The events that are generated on a particular CPU are logged in the corresponding buffer. This is to provide wait-free writes between CPUs and good NUMA node locality while accessing the ring buffer. However, the allocation routines consider NUMA locality only for buffer page metadata and not for the actual buffer page. This causes the pages to be allocated on the NUMA node local to the CPU where the allocation routine is running at the time. This patch fixes the problem by using a NUMA node specific allocation routine so that the pages are allocated from a NUMA node local to the logging CPU. I tested with the getuid_microbench from autotest. It is a simple binary that calls getuid() in a loop and measures the average time for the syscall to complete. The following command was used to test: $ getuid_microbench 1000000 Compared the numbers found on kernel with and without this patch and found that logging latency decreases by 30-50 ns/call. tracing with non-NUMA allocation - 569 ns/call tracing with NUMA allocation - 512 ns/call Signed-off-by: Vaibhav Nagarnaik <vnagarnaik@google.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Michael Rubin <mrubin@google.com> Cc: David Sharp <dhsharp@google.com> Link: http://lkml.kernel.org/r/1304470602-20366-1-git-send-email-vnagarnaik@google.com Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-04 08:56:42 +08:00
bpage = page_address(page);
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
out:
rb_init_page(bpage);
return bpage;
}
EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
/**
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
* @cpu: the cpu buffer the page came from
* @data: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer;
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
struct buffer_data_page *bpage = data;
struct page *page = virt_to_page(bpage);
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
unsigned long flags;
if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
return;
cpu_buffer = buffer->buffers[cpu];
/* If the page is still in use someplace else, we can't reuse it */
if (page_ref_count(page) > 1)
goto out;
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (!cpu_buffer->free_page) {
cpu_buffer->free_page = bpage;
bpage = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
out:
ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page. The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page() which allocates and initializes a reader page for the ring buffer to be swapped into the ring buffer for a new filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold it off and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times the ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-05-01 21:35:09 +08:00
free_page((unsigned long)bpage);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
/**
* ring_buffer_read_page - extract a page from the ring buffer
* @buffer: buffer to extract from
* @data_page: the page to use allocated from ring_buffer_alloc_read_page
* @len: amount to extract
* @cpu: the cpu of the buffer to extract
* @full: should the extraction only happen when the page is full.
*
* This function will pull out a page from the ring buffer and consume it.
* @data_page must be the address of the variable that was returned
* from ring_buffer_alloc_read_page. This is because the page might be used
* to swap with a page in the ring buffer.
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu <chuhu@redhat.com> Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2017-08-03 02:20:54 +08:00
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
*
* Note: it is up to the calling functions to handle sleeps and wakeups.
* The ring buffer can be used anywhere in the kernel and can not
* blindly call wake_up. The layer that uses the ring buffer must be
* responsible for that.
*
* Returns:
* >=0 if data has been transferred, returns the offset of consumed data.
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
void **data_page, size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
int ret = -1;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
goto out;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
goto out;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page)
goto out;
bpage = *data_page;
if (!bpage)
goto out;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
goto out_unlock;
event = rb_reader_event(cpu_buffer);
read = reader->read;
commit = rb_page_commit(reader);
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
* a writer is still on the page, then
* we must copy the data from the page to the buffer.
* Otherwise, we can simply swap the page with the one passed in.
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
unsigned int size;
/*
* If a full page is expected, this can still be returned
* if there's been a previous partial read and the
* rest of the page can be read and the commit page is off
* the reader page.
*/
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
goto out_unlock;
if (len > (commit - read))
len = (commit - read);
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
if (len < size)
goto out_unlock;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
/* Need to copy one event at a time */
do {
/* We need the size of one event, because
* rb_advance_reader only advances by one event,
* whereas rb_event_ts_length may include the size of
* one or two events.
* We have already ensured there's enough space if this
* is a time extend. */
size = rb_event_length(event);
memcpy(bpage->data + pos, rpage->data + rpos, size);
len -= size;
rb_advance_reader(cpu_buffer);
rpos = reader->read;
pos += size;
tracing: Fix ring_buffer_read_page reading out of page boundary With the configuration: CONFIG_DEBUG_PAGEALLOC=y and Shaohua's patch: [PATCH]x86: make spurious_fault check correct pte bit Function call graph trace with the following will trigger a page fault. # cd /sys/kernel/debug/tracing/ # echo function_graph > current_tracer # cat per_cpu/cpu1/trace_pipe_raw > /dev/null BUG: unable to handle kernel paging request at ffff880006e99000 IP: [<ffffffff81085572>] rb_event_length+0x1/0x3f PGD 1b19063 PUD 1b1d063 PMD 3f067 PTE 6e99160 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC last sysfs file: /sys/devices/virtual/net/lo/operstate CPU 1 Modules linked in: Pid: 1982, comm: cat Not tainted 2.6.35-rc6-aes+ #300 /Bochs RIP: 0010:[<ffffffff81085572>] [<ffffffff81085572>] rb_event_length+0x1/0x3f RSP: 0018:ffff880006475e38 EFLAGS: 00010006 RAX: 0000000000000ff0 RBX: ffff88000786c630 RCX: 000000000000001d RDX: ffff880006e98000 RSI: 0000000000000ff0 RDI: ffff880006e99000 RBP: ffff880006475eb8 R08: 000000145d7008bd R09: 0000000000000000 R10: 0000000000008000 R11: ffffffff815d9336 R12: ffff880006d08000 R13: ffff880006e605d8 R14: 0000000000000000 R15: 0000000000000018 FS: 00007f2b83e456f0(0000) GS:ffff880002100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff880006e99000 CR3: 00000000064a8000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process cat (pid: 1982, threadinfo ffff880006474000, task ffff880006e40770) Stack: ffff880006475eb8 ffffffff8108730f 0000000000000ff0 000000145d7008bd <0> ffff880006e98010 ffff880006d08010 0000000000000296 ffff88000786c640 <0> ffffffff81002956 0000000000000000 ffff8800071f4680 ffff8800071f4680 Call Trace: [<ffffffff8108730f>] ? ring_buffer_read_page+0x15a/0x24a [<ffffffff81002956>] ? return_to_handler+0x15/0x2f [<ffffffff8108a575>] tracing_buffers_read+0xb9/0x164 [<ffffffff810debfe>] vfs_read+0xaf/0x150 [<ffffffff81002941>] return_to_handler+0x0/0x2f [<ffffffff810248b0>] __bad_area_nosemaphore+0x17e/0x1a1 [<ffffffff81002941>] return_to_handler+0x0/0x2f [<ffffffff810248e6>] bad_area_nosemaphore+0x13/0x15 Code: 80 25 b2 16 b3 00 fe c9 c3 55 48 89 e5 f0 80 0d a4 16 b3 00 02 c9 c3 55 31 c0 48 89 e5 48 83 3d 94 16 b3 00 01 c9 0f 94 c0 c3 55 <8a> 0f 48 89 e5 83 e1 1f b8 08 00 00 00 0f b6 d1 83 fa 1e 74 27 RIP [<ffffffff81085572>] rb_event_length+0x1/0x3f RSP <ffff880006475e38> CR2: ffff880006e99000 ---[ end trace a6877bb92ccb36bb ]--- The root cause is that ring_buffer_read_page() may read out of page boundary, because the boundary checking is done after reading. This is fixed via doing boundary checking before reading. Reported-by: Shaohua Li <shaohua.li@intel.com> Cc: <stable@kernel.org> Signed-off-by: Huang Ying <ying.huang@intel.com> LKML-Reference: <1280297641.2771.307.camel@yhuang-dev> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-07-28 14:14:01 +08:00
if (rpos >= commit)
break;
event = rb_reader_event(cpu_buffer);
ring-buffer: Bind time extend and data events together When the time between two timestamps is greater than 2^27 nanosecs (~134 ms) a time extend event is added that extends the time difference to 59 bits (~18 years). This is due to events only having a 27 bit field to store time. Currently this time extend is a separate event. We add it just before the event data that is being written to the buffer. But before the event data is committed, the event data can also be discarded (as with the case of filters). But because the time extend has already been committed, it will stay in the buffer. If lots of events are being filtered and no event is being written, then every 134ms a time extend can be added to the buffer without any data attached. To keep from filling the entire buffer with time extends, a time extend will never be the first event in a page because the page timestamp can be used. Time extends can only fill the rest of a page with some data at the beginning. This patch binds the time extend with the data. The difference here is that the time extend is not committed before the data is added. Instead, when a time extend is needed, the space reserved on the ring buffer is the time extend + the data event size. The time extend is added to the first part of the reserved block and the data is added to the second. The time extend event is passed back to the reserver, but since the reserver also uses a function to find the data portion of the reserved block, no changes to the ring buffer interface need to be made. When a commit is discarded, we now remove both the time extend and the event. With this approach no more than one time extend can be in the buffer in a row. Data must always follow a time extend. Thanks to Mathieu Desnoyers for suggesting this idea. Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-10-08 06:18:05 +08:00
/* Always keep the time extend and data together */
size = rb_event_ts_length(event);
} while (len >= size);
/* update bpage */
local_set(&bpage->commit, pos);
bpage->time_stamp = save_timestamp;
/* we copied everything to the beginning */
read = 0;
} else {
/* update the entry counter */
ring-buffer: make lockless This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs). The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags. The list pages are allocated to be aligned such that the lowest significant bits are always zero pointing to the list. This gives us play to put in flags to their pointers. bit 0: set when the page is a head page bit 1: set when the writer is moving the page (for overwrite mode) cmpxchg is used to update the pointer. When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer. The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is discribed in detail within the comments of the code. Changes in version 2: - Let reader reset entries value of header page. - Fix tail page passing commit page on reader page test. - Always increment entries and write counter in rb_tail_page_update - Add safety check in rb_set_commit_to_write to break out of infinite loop - add mask in rb_is_reader_page [ Impact: lock free writing to the ring buffer ] Signed-off-by: Steven Rostedt <srostedt@redhat.com>
2009-03-27 23:00:29 +08:00
cpu_buffer->read += rb_page_entries(reader);
cpu_buffer->read_bytes += rb_page_commit(reader);
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
reader->page = *data_page;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
*data_page = bpage;
/*
* Use the real_end for the data size,
* This gives us a chance to store the lost events
* on the page.
*/
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
ret = read;
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
cpu_buffer->lost_events = 0;
commit = local_read(&bpage->commit);
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/*
* Set a flag in the commit field if we lost events
*/
if (missed_events) {
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
commit += sizeof(missed_events);
}
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
local_add(RB_MISSED_EVENTS, &bpage->commit);
}
ring-buffer: Add place holder recording of dropped events Currently, when the ring buffer drops events, it does not record the fact that it did so. It does inform the writer that the event was dropped by returning a NULL event, but it does not put in any place holder where the event was dropped. This is not a trivial thing to add because the ring buffer mostly runs in overwrite (flight recorder) mode. That is, when the ring buffer is full, new data will overwrite old data. In a produce/consumer mode, where new data is simply dropped when the ring buffer is full, it is trivial to add the placeholder for dropped events. When there's more room to write new data, then a special event can be added to notify the reader about the dropped events. But in overwrite mode, any new write can overwrite events. A place holder can not be inserted into the ring buffer since there never may be room. A reader could also come in at anytime and miss the placeholder. Luckily, the way the ring buffer works, the read side can find out if events were lost or not, and how many events. Everytime a write takes place, if it overwrites the header page (the next read) it updates a "overrun" variable that keeps track of the number of lost events. When a reader swaps out a page from the ring buffer, it can record this number, perfom the swap, and then check to see if the number changed, and take the diff if it has, which would be the number of events dropped. This can be stored by the reader and returned to callers of the reader. Since the reader page swap will fail if the writer moved the head page since the time the reader page set up the swap, this gives room to record the overruns without worrying about races. If the reader sets up the pages, records the overrun, than performs the swap, if the swap succeeds, then the overrun variable has not been updated since the setup before the swap. For binary readers of the ring buffer, a flag is set in the header of each sub page (sub buffer) of the ring buffer. This flag is embedded in the size field of the data on the sub buffer, in the 31st bit (the size can be 32 or 64 bits depending on the architecture), but only 27 bits needs to be used for the actual size (less actually). We could add a new field in the sub buffer header to also record the number of events dropped since the last read, but this will change the format of the binary ring buffer a bit too much. Perhaps this change can be made if the information on the number of events dropped is considered important enough. Note, the notification of dropped events is only used by consuming reads or peeking at the ring buffer. Iterating over the ring buffer does not keep this information because the necessary data is only available when a page swap is made, and the iterator does not swap out pages. Cc: Robert Richter <robert.richter@amd.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2010-04-01 01:21:56 +08:00
/*
* This page may be off to user land. Zero it out here.
*/
if (commit < BUF_PAGE_SIZE)
memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
* the buffer.
*/
int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
{
struct trace_buffer *buffer;
ring-buffer: Use long for nr_pages to avoid overflow failures The size variable to change the ring buffer in ftrace is a long. The nr_pages used to update the ring buffer based on the size is int. On 64 bit machines this can cause an overflow problem. For example, the following will cause the ring buffer to crash: # cd /sys/kernel/debug/tracing # echo 10 > buffer_size_kb # echo 8556384240 > buffer_size_kb Then you get the warning of: WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260 Which is: RB_WARN_ON(cpu_buffer, nr_removed); Note each ring buffer page holds 4080 bytes. This is because: 1) 10 causes the ring buffer to have 3 pages. (10kb requires 3 * 4080 pages to hold) 2) (2^31 / 2^10 + 1) * 4080 = 8556384240 The value written into buffer_size_kb is shifted by 10 and then passed to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE which is 4080. 8761737461760 / 4080 = 2147484672 4) nr_pages is subtracted from the current nr_pages (3) and we get: 2147484669. This value is saved in a signed integer nr_pages_to_update 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int turns into the value of -2147482627 6) As the value is a negative number, in update_pages_handler() it is negated and passed to rb_remove_pages() and 2147482627 pages will be removed, which is much larger than 3 and it causes the warning because not all the pages asked to be removed were removed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001 Cc: stable@vger.kernel.org # 2.6.28+ Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer") Reported-by: Hao Qin <QEver.cn@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2016-05-12 23:01:24 +08:00
long nr_pages_same;
int cpu_i;
unsigned long nr_pages;
buffer = container_of(node, struct trace_buffer, node);
if (cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
nr_pages = 0;
nr_pages_same = 1;
/* check if all cpu sizes are same */
for_each_buffer_cpu(buffer, cpu_i) {
/* fill in the size from first enabled cpu */
if (nr_pages == 0)
nr_pages = buffer->buffers[cpu_i]->nr_pages;
if (nr_pages != buffer->buffers[cpu_i]->nr_pages) {
nr_pages_same = 0;
break;
}
}
/* allocate minimum pages, user can later expand it */
if (!nr_pages_same)
nr_pages = 2;
buffer->buffers[cpu] =
rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu]) {
WARN(1, "failed to allocate ring buffer on CPU %u\n",
cpu);
return -ENOMEM;
}
smp_wmb();
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
/*
* This is a basic integrity check of the ring buffer.
* Late in the boot cycle this test will run when configured in.
* It will kick off a thread per CPU that will go into a loop
* writing to the per cpu ring buffer various sizes of data.
* Some of the data will be large items, some small.
*
* Another thread is created that goes into a spin, sending out
* IPIs to the other CPUs to also write into the ring buffer.
* this is to test the nesting ability of the buffer.
*
* Basic stats are recorded and reported. If something in the
* ring buffer should happen that's not expected, a big warning
* is displayed and all ring buffers are disabled.
*/
static struct task_struct *rb_threads[NR_CPUS] __initdata;
struct rb_test_data {
struct trace_buffer *buffer;
unsigned long events;
unsigned long bytes_written;
unsigned long bytes_alloc;
unsigned long bytes_dropped;
unsigned long events_nested;
unsigned long bytes_written_nested;
unsigned long bytes_alloc_nested;
unsigned long bytes_dropped_nested;
int min_size_nested;
int max_size_nested;
int max_size;
int min_size;
int cpu;
int cnt;
};
static struct rb_test_data rb_data[NR_CPUS] __initdata;
/* 1 meg per cpu */
#define RB_TEST_BUFFER_SIZE 1048576
static char rb_string[] __initdata =
"abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
"?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
"!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
static bool rb_test_started __initdata;
struct rb_item {
int size;
char str[];
};
static __init int rb_write_something(struct rb_test_data *data, bool nested)
{
struct ring_buffer_event *event;
struct rb_item *item;
bool started;
int event_len;
int size;
int len;
int cnt;
/* Have nested writes different that what is written */
cnt = data->cnt + (nested ? 27 : 0);
/* Multiply cnt by ~e, to make some unique increment */
size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
len = size + sizeof(struct rb_item);
started = rb_test_started;
/* read rb_test_started before checking buffer enabled */
smp_rmb();
event = ring_buffer_lock_reserve(data->buffer, len);
if (!event) {
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
data->bytes_dropped += len;
else
data->bytes_dropped_nested += len;
}
return len;
}
event_len = ring_buffer_event_length(event);
if (RB_WARN_ON(data->buffer, event_len < len))
goto out;
item = ring_buffer_event_data(event);
item->size = size;
memcpy(item->str, rb_string, size);
if (nested) {
data->bytes_alloc_nested += event_len;
data->bytes_written_nested += len;
data->events_nested++;
if (!data->min_size_nested || len < data->min_size_nested)
data->min_size_nested = len;
if (len > data->max_size_nested)
data->max_size_nested = len;
} else {
data->bytes_alloc += event_len;
data->bytes_written += len;
data->events++;
if (!data->min_size || len < data->min_size)
data->max_size = len;
if (len > data->max_size)
data->max_size = len;
}
out:
ring_buffer_unlock_commit(data->buffer);
return 0;
}
static __init int rb_test(void *arg)
{
struct rb_test_data *data = arg;
while (!kthread_should_stop()) {
rb_write_something(data, false);
data->cnt++;
set_current_state(TASK_INTERRUPTIBLE);
/* Now sleep between a min of 100-300us and a max of 1ms */
usleep_range(((data->cnt % 3) + 1) * 100, 1000);
}
return 0;
}
static __init void rb_ipi(void *ignore)
{
struct rb_test_data *data;
int cpu = smp_processor_id();
data = &rb_data[cpu];
rb_write_something(data, true);
}
static __init int rb_hammer_test(void *arg)
{
while (!kthread_should_stop()) {
/* Send an IPI to all cpus to write data! */
smp_call_function(rb_ipi, NULL, 1);
/* No sleep, but for non preempt, let others run */
schedule();
}
return 0;
}
static __init int test_ringbuffer(void)
{
struct task_struct *rb_hammer;
struct trace_buffer *buffer;
int cpu;
int ret = 0;
if (security_locked_down(LOCKDOWN_TRACEFS)) {
pr_warn("Lockdown is enabled, skipping ring buffer tests\n");
return 0;
}
pr_info("Running ring buffer tests...\n");
buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
if (WARN_ON(!buffer))
return 0;
/* Disable buffer so that threads can't write to it yet */
ring_buffer_record_off(buffer);
for_each_online_cpu(cpu) {
rb_data[cpu].buffer = buffer;
rb_data[cpu].cpu = cpu;
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu],
cpu, "rbtester/%u");
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
}
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
if (WARN_ON(IS_ERR(rb_hammer))) {
pr_cont("FAILED\n");
ret = PTR_ERR(rb_hammer);
goto out_free;
}
ring_buffer_record_on(buffer);
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
* dropped and the thread wont catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
* the threads see that the buffer is active.
*/
smp_wmb();
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
/* Just run for 10 seconds */;
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
out_free:
for_each_online_cpu(cpu) {
if (!rb_threads[cpu])
break;
kthread_stop(rb_threads[cpu]);
}
if (ret) {
ring_buffer_free(buffer);
return ret;
}
/* Report! */
pr_info("finished\n");
for_each_online_cpu(cpu) {
struct ring_buffer_event *event;
struct rb_test_data *data = &rb_data[cpu];
struct rb_item *item;
unsigned long total_events;
unsigned long total_dropped;
unsigned long total_written;
unsigned long total_alloc;
unsigned long total_read = 0;
unsigned long total_size = 0;
unsigned long total_len = 0;
unsigned long total_lost = 0;
unsigned long lost;
int big_event_size;
int small_event_size;
ret = -1;
total_events = data->events + data->events_nested;
total_written = data->bytes_written + data->bytes_written_nested;
total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
big_event_size = data->max_size + data->max_size_nested;
small_event_size = data->min_size + data->min_size_nested;
pr_info("CPU %d:\n", cpu);
pr_info(" events: %ld\n", total_events);
pr_info(" dropped bytes: %ld\n", total_dropped);
pr_info(" alloced bytes: %ld\n", total_alloc);
pr_info(" written bytes: %ld\n", total_written);
pr_info(" biggest event: %d\n", big_event_size);
pr_info(" smallest event: %d\n", small_event_size);
if (RB_WARN_ON(buffer, total_dropped))
break;
ret = 0;
while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
total_lost += lost;
item = ring_buffer_event_data(event);
total_len += ring_buffer_event_length(event);
total_size += item->size + sizeof(struct rb_item);
if (memcmp(&item->str[0], rb_string, item->size) != 0) {
pr_info("FAILED!\n");
pr_info("buffer had: %.*s\n", item->size, item->str);
pr_info("expected: %.*s\n", item->size, rb_string);
RB_WARN_ON(buffer, 1);
ret = -1;
break;
}
total_read++;
}
if (ret)
break;
ret = -1;
pr_info(" read events: %ld\n", total_read);
pr_info(" lost events: %ld\n", total_lost);
pr_info(" total events: %ld\n", total_lost + total_read);
pr_info(" recorded len bytes: %ld\n", total_len);
pr_info(" recorded size bytes: %ld\n", total_size);
if (total_lost) {
pr_info(" With dropped events, record len and size may not match\n"
" alloced and written from above\n");
} else {
if (RB_WARN_ON(buffer, total_len != total_alloc ||
total_size != total_written))
break;
}
if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
break;
ret = 0;
}
if (!ret)
pr_info("Ring buffer PASSED!\n");
ring_buffer_free(buffer);
return 0;
}
late_initcall(test_ringbuffer);
#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */