linux/kernel/relay.c

1248 lines
30 KiB
C
Raw Normal View History

/*
* Public API and common code for kernel->userspace relay file support.
*
* See Documentation/filesystems/relay.rst for an overview.
*
* Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
* Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
*
* Moved to kernel/relay.c by Paul Mundt, 2006.
* November 2006 - CPU hotplug support by Mathieu Desnoyers
* (mathieu.desnoyers@polymtl.ca)
*
* This file is released under the GPL.
*/
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/relay.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/splice.h>
/* list of open channels, for cpu hotplug */
static DEFINE_MUTEX(relay_channels_mutex);
static LIST_HEAD(relay_channels);
/*
* fault() vm_op implementation for relay file mapping.
*/
static vm_fault_t relay_buf_fault(struct vm_fault *vmf)
{
struct page *page;
struct rchan_buf *buf = vmf->vma->vm_private_data;
pgoff_t pgoff = vmf->pgoff;
if (!buf)
return VM_FAULT_OOM;
page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
if (!page)
return VM_FAULT_SIGBUS;
get_page(page);
vmf->page = page;
return 0;
}
/*
* vm_ops for relay file mappings.
*/
static const struct vm_operations_struct relay_file_mmap_ops = {
.fault = relay_buf_fault,
};
/*
* allocate an array of pointers of struct page
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
}
/*
* free an array of pointers of struct page
*/
static void relay_free_page_array(struct page **array)
{
kvfree(array);
}
/**
* relay_mmap_buf: - mmap channel buffer to process address space
* @buf: relay channel buffer
* @vma: vm_area_struct describing memory to be mapped
*
* Returns 0 if ok, negative on error
*
* Caller should already have grabbed mmap_lock.
*/
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
unsigned long length = vma->vm_end - vma->vm_start;
if (!buf)
return -EBADF;
if (length != (unsigned long)buf->chan->alloc_size)
return -EINVAL;
vma->vm_ops = &relay_file_mmap_ops;
mm: replace vma->vm_flags direct modifications with modifier calls Replace direct modifications to vma->vm_flags with calls to modifier functions to be able to track flag changes and to keep vma locking correctness. [akpm@linux-foundation.org: fix drivers/misc/open-dice.c, per Hyeonggon Yoo] Link: https://lkml.kernel.org/r/20230126193752.297968-5-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Acked-by: Sebastian Reichel <sebastian.reichel@collabora.com> Reviewed-by: Liam R. Howlett <Liam.Howlett@Oracle.com> Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Andy Lutomirski <luto@kernel.org> Cc: Arjun Roy <arjunroy@google.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: David Rientjes <rientjes@google.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Joel Fernandes <joelaf@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kent Overstreet <kent.overstreet@linux.dev> Cc: Laurent Dufour <ldufour@linux.ibm.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Minchan Kim <minchan@google.com> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Peter Oskolkov <posk@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Punit Agrawal <punit.agrawal@bytedance.com> Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Shakeel Butt <shakeelb@google.com> Cc: Soheil Hassas Yeganeh <soheil@google.com> Cc: Song Liu <songliubraving@fb.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-27 03:37:49 +08:00
vm_flags_set(vma, VM_DONTEXPAND);
vma->vm_private_data = buf;
return 0;
}
/**
* relay_alloc_buf - allocate a channel buffer
* @buf: the buffer struct
* @size: total size of the buffer
*
* Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
* passed in size will get page aligned, if it isn't already.
*/
static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
{
void *mem;
unsigned int i, j, n_pages;
*size = PAGE_ALIGN(*size);
n_pages = *size >> PAGE_SHIFT;
buf->page_array = relay_alloc_page_array(n_pages);
if (!buf->page_array)
return NULL;
for (i = 0; i < n_pages; i++) {
buf->page_array[i] = alloc_page(GFP_KERNEL);
if (unlikely(!buf->page_array[i]))
goto depopulate;
set_page_private(buf->page_array[i], (unsigned long)buf);
}
mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
if (!mem)
goto depopulate;
memset(mem, 0, *size);
buf->page_count = n_pages;
return mem;
depopulate:
for (j = 0; j < i; j++)
__free_page(buf->page_array[j]);
relay_free_page_array(buf->page_array);
return NULL;
}
/**
* relay_create_buf - allocate and initialize a channel buffer
* @chan: the relay channel
*
* Returns channel buffer if successful, %NULL otherwise.
*/
static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
struct rchan_buf *buf;
if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t))
return NULL;
buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
if (!buf)
return NULL;
buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t),
treewide: kmalloc() -> kmalloc_array() The kmalloc() function has a 2-factor argument form, kmalloc_array(). This patch replaces cases of: kmalloc(a * b, gfp) with: kmalloc_array(a * b, gfp) as well as handling cases of: kmalloc(a * b * c, gfp) with: kmalloc(array3_size(a, b, c), gfp) as it's slightly less ugly than: kmalloc_array(array_size(a, b), c, gfp) This does, however, attempt to ignore constant size factors like: kmalloc(4 * 1024, gfp) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The tools/ directory was manually excluded, since it has its own implementation of kmalloc(). The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( kmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) | kmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( kmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) | kmalloc( - sizeof(u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(__u8) * COUNT + COUNT , ...) | kmalloc( - sizeof(char) * COUNT + COUNT , ...) | kmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_ID) + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_ID + COUNT_ID, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (COUNT_CONST) + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * COUNT_CONST + COUNT_CONST, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_ID) + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_ID + COUNT_ID, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (COUNT_CONST) + COUNT_CONST, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * COUNT_CONST + COUNT_CONST, sizeof(THING) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ - kmalloc + kmalloc_array ( - SIZE * COUNT + COUNT, SIZE , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( kmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) | kmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( kmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) | kmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( kmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) | kmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products, // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( kmalloc(C1 * C2 * C3, ...) | kmalloc( - (E1) * E2 * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * E3 + array3_size(E1, E2, E3) , ...) | kmalloc( - (E1) * (E2) * (E3) + array3_size(E1, E2, E3) , ...) | kmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants, // keeping sizeof() as the second factor argument. @@ expression THING, E1, E2; type TYPE; constant C1, C2, C3; @@ ( kmalloc(sizeof(THING) * C2, ...) | kmalloc(sizeof(TYPE) * C2, ...) | kmalloc(C1 * C2 * C3, ...) | kmalloc(C1 * C2, ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * (E2) + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(TYPE) * E2 + E2, sizeof(TYPE) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * (E2) + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - sizeof(THING) * E2 + E2, sizeof(THING) , ...) | - kmalloc + kmalloc_array ( - (E1) * E2 + E1, E2 , ...) | - kmalloc + kmalloc_array ( - (E1) * (E2) + E1, E2 , ...) | - kmalloc + kmalloc_array ( - E1 * E2 + E1, E2 , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 04:55:00 +08:00
GFP_KERNEL);
if (!buf->padding)
goto free_buf;
buf->start = relay_alloc_buf(buf, &chan->alloc_size);
if (!buf->start)
goto free_buf;
buf->chan = chan;
kref_get(&buf->chan->kref);
return buf;
free_buf:
kfree(buf->padding);
kfree(buf);
return NULL;
}
/**
* relay_destroy_channel - free the channel struct
* @kref: target kernel reference that contains the relay channel
*
* Should only be called from kref_put().
*/
static void relay_destroy_channel(struct kref *kref)
{
struct rchan *chan = container_of(kref, struct rchan, kref);
kernel/relay.c: fix memleak on destroy relay channel kmemleak report memory leak as follows: unreferenced object 0x607ee4e5f948 (size 8): comm "syz-executor.1", pid 2098, jiffies 4295031601 (age 288.468s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: relay_open kernel/relay.c:583 [inline] relay_open+0xb6/0x970 kernel/relay.c:563 do_blk_trace_setup+0x4a8/0xb20 kernel/trace/blktrace.c:557 __blk_trace_setup+0xb6/0x150 kernel/trace/blktrace.c:597 blk_trace_ioctl+0x146/0x280 kernel/trace/blktrace.c:738 blkdev_ioctl+0xb2/0x6a0 block/ioctl.c:613 block_ioctl+0xe5/0x120 fs/block_dev.c:1871 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x170/0x1ce fs/ioctl.c:739 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 'chan->buf' is malloced in relay_open() by alloc_percpu() but not free while destroy the relay channel. Fix it by adding free_percpu() before return from relay_destroy_channel(). Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: Hulk Robot <hulkci@huawei.com> Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: David Rientjes <rientjes@google.com> Cc: Michel Lespinasse <walken@google.com> Cc: Daniel Axtens <dja@axtens.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Akash Goel <akash.goel@intel.com> Cc: <stable@vger.kernel.org> Link: http://lkml.kernel.org/r/20200817122826.48518-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-08-21 08:42:14 +08:00
free_percpu(chan->buf);
kfree(chan);
}
/**
* relay_destroy_buf - destroy an rchan_buf struct and associated buffer
* @buf: the buffer struct
*/
static void relay_destroy_buf(struct rchan_buf *buf)
{
struct rchan *chan = buf->chan;
unsigned int i;
if (likely(buf->start)) {
vunmap(buf->start);
for (i = 0; i < buf->page_count; i++)
__free_page(buf->page_array[i]);
relay_free_page_array(buf->page_array);
}
*per_cpu_ptr(chan->buf, buf->cpu) = NULL;
kfree(buf->padding);
kfree(buf);
kref_put(&chan->kref, relay_destroy_channel);
}
/**
* relay_remove_buf - remove a channel buffer
* @kref: target kernel reference that contains the relay buffer
*
* Removes the file from the filesystem, which also frees the
* rchan_buf_struct and the channel buffer. Should only be called from
* kref_put().
*/
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
relay_destroy_buf(buf);
}
/**
* relay_buf_empty - boolean, is the channel buffer empty?
* @buf: channel buffer
*
* Returns 1 if the buffer is empty, 0 otherwise.
*/
static int relay_buf_empty(struct rchan_buf *buf)
{
return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
}
/**
* relay_buf_full - boolean, is the channel buffer full?
* @buf: channel buffer
*
* Returns 1 if the buffer is full, 0 otherwise.
*/
int relay_buf_full(struct rchan_buf *buf)
{
size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
return (ready >= buf->chan->n_subbufs) ? 1 : 0;
}
EXPORT_SYMBOL_GPL(relay_buf_full);
/*
* High-level relay kernel API and associated functions.
*/
static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf,
void *prev_subbuf, size_t prev_padding)
{
if (!buf->chan->cb->subbuf_start)
return !relay_buf_full(buf);
return buf->chan->cb->subbuf_start(buf, subbuf,
prev_subbuf, prev_padding);
}
/**
* wakeup_readers - wake up readers waiting on a channel
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
* @work: contains the channel buffer
*
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
* This is the function used to defer reader waking
*/
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
static void wakeup_readers(struct irq_work *work)
{
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
struct rchan_buf *buf;
buf = container_of(work, struct rchan_buf, wakeup_work);
wake_up_interruptible(&buf->read_wait);
}
/**
* __relay_reset - reset a channel buffer
* @buf: the channel buffer
* @init: 1 if this is a first-time initialization
*
* See relay_reset() for description of effect.
*/
static void __relay_reset(struct rchan_buf *buf, unsigned int init)
{
size_t i;
if (init) {
init_waitqueue_head(&buf->read_wait);
kref_init(&buf->kref);
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
init_irq_work(&buf->wakeup_work, wakeup_readers);
} else {
irq_work_sync(&buf->wakeup_work);
}
buf->subbufs_produced = 0;
buf->subbufs_consumed = 0;
buf->bytes_consumed = 0;
buf->finalized = 0;
buf->data = buf->start;
buf->offset = 0;
for (i = 0; i < buf->chan->n_subbufs; i++)
buf->padding[i] = 0;
relay_subbuf_start(buf, buf->data, NULL, 0);
}
/**
* relay_reset - reset the channel
* @chan: the channel
*
* This has the effect of erasing all data from all channel buffers
* and restarting the channel in its initial state. The buffers
* are not freed, so any mappings are still in effect.
*
* NOTE. Care should be taken that the channel isn't actually
* being used by anything when this call is made.
*/
void relay_reset(struct rchan *chan)
{
struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
__relay_reset(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
if ((buf = *per_cpu_ptr(chan->buf, i)))
__relay_reset(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_reset);
static inline void relay_set_buf_dentry(struct rchan_buf *buf,
struct dentry *dentry)
{
buf->dentry = dentry;
d_inode(buf->dentry)->i_size = buf->early_bytes;
}
static struct dentry *relay_create_buf_file(struct rchan *chan,
struct rchan_buf *buf,
unsigned int cpu)
{
struct dentry *dentry;
char *tmpname;
tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!tmpname)
return NULL;
snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
/* Create file in fs */
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
if (IS_ERR(dentry))
dentry = NULL;
kfree(tmpname);
return dentry;
}
/*
* relay_open_buf - create a new relay channel buffer
*
* used by relay_open() and CPU hotplug.
*/
static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
{
struct rchan_buf *buf = NULL;
struct dentry *dentry;
if (chan->is_global)
return *per_cpu_ptr(chan->buf, 0);
buf = relay_create_buf(chan);
if (!buf)
return NULL;
if (chan->has_base_filename) {
dentry = relay_create_buf_file(chan, buf, cpu);
if (!dentry)
goto free_buf;
relay_set_buf_dentry(buf, dentry);
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
} else {
/* Only retrieve global info, nothing more, nothing less */
dentry = chan->cb->create_buf_file(NULL, NULL,
S_IRUSR, buf,
&chan->is_global);
if (IS_ERR_OR_NULL(dentry))
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
goto free_buf;
}
buf->cpu = cpu;
__relay_reset(buf, 1);
if(chan->is_global) {
*per_cpu_ptr(chan->buf, 0) = buf;
buf->cpu = 0;
}
return buf;
free_buf:
relay_destroy_buf(buf);
return NULL;
}
/**
* relay_close_buf - close a channel buffer
* @buf: channel buffer
*
* Marks the buffer finalized and restores the default callbacks.
* The channel buffer and channel buffer data structure are then freed
* automatically when the last reference is given up.
*/
static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
irq_work_sync(&buf->wakeup_work);
buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
int relay_prepare_cpu(unsigned int cpu)
{
struct rchan *chan;
struct rchan_buf *buf;
mutex_lock(&relay_channels_mutex);
list_for_each_entry(chan, &relay_channels, list) {
if (*per_cpu_ptr(chan->buf, cpu))
continue;
buf = relay_open_buf(chan, cpu);
if (!buf) {
pr_err("relay: cpu %d buffer creation failed\n", cpu);
mutex_unlock(&relay_channels_mutex);
return -ENOMEM;
}
*per_cpu_ptr(chan->buf, cpu) = buf;
}
mutex_unlock(&relay_channels_mutex);
return 0;
}
/**
* relay_open - create a new relay channel
* @base_filename: base name of files to create, %NULL for buffering only
* @parent: dentry of parent directory, %NULL for root directory or buffer
* @subbuf_size: size of sub-buffers
* @n_subbufs: number of sub-buffers
* @cb: client callback functions
* @private_data: user-defined data
*
* Returns channel pointer if successful, %NULL otherwise.
*
* Creates a channel buffer for each cpu using the sizes and
* attributes specified. The created channel buffer files
* will be named base_filename0...base_filenameN-1. File
* permissions will be %S_IRUSR.
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
*
* If opening a buffer (@parent = NULL) that you later wish to register
* in a filesystem, call relay_late_setup_files() once the @parent dentry
* is available.
*/
struct rchan *relay_open(const char *base_filename,
struct dentry *parent,
size_t subbuf_size,
size_t n_subbufs,
const struct rchan_callbacks *cb,
void *private_data)
{
unsigned int i;
struct rchan *chan;
struct rchan_buf *buf;
if (!(subbuf_size && n_subbufs))
return NULL;
if (subbuf_size > UINT_MAX / n_subbufs)
return NULL;
if (!cb || !cb->create_buf_file || !cb->remove_buf_file)
return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
return NULL;
chan->buf = alloc_percpu(struct rchan_buf *);
kernel/relay.c: handle alloc_percpu returning NULL in relay_open alloc_percpu() may return NULL, which means chan->buf may be set to NULL. In that case, when we do *per_cpu_ptr(chan->buf, ...), we dereference an invalid pointer: BUG: Unable to handle kernel data access at 0x7dae0000 Faulting instruction address: 0xc0000000003f3fec ... NIP relay_open+0x29c/0x600 LR relay_open+0x270/0x600 Call Trace: relay_open+0x264/0x600 (unreliable) __blk_trace_setup+0x254/0x600 blk_trace_setup+0x68/0xa0 sg_ioctl+0x7bc/0x2e80 do_vfs_ioctl+0x13c/0x1300 ksys_ioctl+0x94/0x130 sys_ioctl+0x48/0xb0 system_call+0x5c/0x68 Check if alloc_percpu returns NULL. This was found by syzkaller both on x86 and powerpc, and the reproducer it found on powerpc is capable of hitting the issue as an unprivileged user. Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: syzbot+1e925b4b836afe85a1c6@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+587b2421926808309d21@syzkaller-ppc64.appspotmail.com Reported-by: syzbot+58320b7171734bf79d26@syzkaller.appspotmail.com Reported-by: syzbot+d6074fb08bdb2e010520@syzkaller.appspotmail.com Signed-off-by: Daniel Axtens <dja@axtens.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Michael Ellerman <mpe@ellerman.id.au> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Acked-by: David Rientjes <rientjes@google.com> Cc: Akash Goel <akash.goel@intel.com> Cc: Andrew Donnellan <ajd@linux.ibm.com> Cc: Guenter Roeck <linux@roeck-us.net> Cc: Salvatore Bonaccorso <carnil@debian.org> Cc: <stable@vger.kernel.org> [4.10+] Link: http://lkml.kernel.org/r/20191219121256.26480-1-dja@axtens.net Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-05 07:51:27 +08:00
if (!chan->buf) {
kfree(chan);
return NULL;
}
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
if (base_filename) {
chan->has_base_filename = 1;
strscpy(chan->base_filename, base_filename, NAME_MAX);
}
chan->cb = cb;
kref_init(&chan->kref);
mutex_lock(&relay_channels_mutex);
for_each_online_cpu(i) {
buf = relay_open_buf(chan, i);
if (!buf)
goto free_bufs;
*per_cpu_ptr(chan->buf, i) = buf;
}
list_add(&chan->list, &relay_channels);
mutex_unlock(&relay_channels_mutex);
return chan;
free_bufs:
for_each_possible_cpu(i) {
if ((buf = *per_cpu_ptr(chan->buf, i)))
relay_close_buf(buf);
}
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
return NULL;
}
EXPORT_SYMBOL_GPL(relay_open);
struct rchan_percpu_buf_dispatcher {
struct rchan_buf *buf;
struct dentry *dentry;
};
/* Called in atomic context. */
static void __relay_set_buf_dentry(void *info)
{
struct rchan_percpu_buf_dispatcher *p = info;
relay_set_buf_dentry(p->buf, p->dentry);
}
/**
* relay_late_setup_files - triggers file creation
* @chan: channel to operate on
* @base_filename: base name of files to create
* @parent: dentry of parent directory, %NULL for root directory
*
* Returns 0 if successful, non-zero otherwise.
*
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
* Use to setup files for a previously buffer-only channel created
* by relay_open() with a NULL parent dentry.
*
* For example, this is useful for perfomring early tracing in kernel,
* before VFS is up and then exposing the early results once the dentry
* is available.
*/
int relay_late_setup_files(struct rchan *chan,
const char *base_filename,
struct dentry *parent)
{
int err = 0;
unsigned int i, curr_cpu;
unsigned long flags;
struct dentry *dentry;
struct rchan_buf *buf;
struct rchan_percpu_buf_dispatcher disp;
if (!chan || !base_filename)
return -EINVAL;
strscpy(chan->base_filename, base_filename, NAME_MAX);
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
if (unlikely(chan->has_base_filename)) {
mutex_unlock(&relay_channels_mutex);
return -EEXIST;
}
chan->has_base_filename = 1;
chan->parent = parent;
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
if (chan->is_global) {
err = -EINVAL;
buf = *per_cpu_ptr(chan->buf, 0);
if (!WARN_ON_ONCE(!buf)) {
dentry = relay_create_buf_file(chan, buf, 0);
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
if (dentry && !WARN_ON_ONCE(!chan->is_global)) {
relay_set_buf_dentry(buf, dentry);
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
err = 0;
}
}
mutex_unlock(&relay_channels_mutex);
return err;
}
curr_cpu = get_cpu();
/*
* The CPU hotplug notifier ran before us and created buffers with
* no files associated. So it's safe to call relay_setup_buf_file()
* on all currently online CPUs.
*/
for_each_online_cpu(i) {
buf = *per_cpu_ptr(chan->buf, i);
if (unlikely(!buf)) {
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
dentry = relay_create_buf_file(chan, buf, i);
if (unlikely(!dentry)) {
err = -EINVAL;
break;
}
if (curr_cpu == i) {
local_irq_save(flags);
relay_set_buf_dentry(buf, dentry);
local_irq_restore(flags);
} else {
disp.buf = buf;
disp.dentry = dentry;
smp_mb();
/* relay_channels_mutex must be held, so wait. */
err = smp_call_function_single(i,
__relay_set_buf_dentry,
&disp, 1);
}
if (unlikely(err))
break;
}
put_cpu();
mutex_unlock(&relay_channels_mutex);
return err;
}
relay: add global mode support for buffer-only channels Commit 20d8b67c06fa ("relay: add buffer-only channels; useful for early logging") added support to use channels with no associated files. This is useful when the exact location of relay file is not known or the the parent directory of relay file is not available, while creating the channel and the logging has to start right from the boot. But there was no provision to use global mode with buffer-only channels, which is added by this patch, without modifying the interface where initially there will be a dummy invocation of create_buf_file callback through which kernel client can convey the need of a global buffer. For the use case where drivers/kernel clients want a simple interface for the userspace, which enables them to capture data/logs from relay file inorder & without any post processing, support of Global buffer mode is warranted. Modules, like i915, using relay_open() in early init would have to later register their buffer-only relays, once debugfs is available, by calling relay_late_setup_files(). Hence relay_late_setup_files() symbol also needs to be exported. Link: http://lkml.kernel.org/r/1468404563-11653-1-git-send-email-akash.goel@intel.com Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:07:18 +08:00
EXPORT_SYMBOL_GPL(relay_late_setup_files);
/**
* relay_switch_subbuf - switch to a new sub-buffer
* @buf: channel buffer
* @length: size of current event
*
* Returns either the length passed in or 0 if full.
*
* Performs sub-buffer-switch tasks such as invoking callbacks,
* updating padding counts, waking up readers, etc.
*/
size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
{
void *old, *new;
size_t old_subbuf, new_subbuf;
if (unlikely(length > buf->chan->subbuf_size))
goto toobig;
if (buf->offset != buf->chan->subbuf_size + 1) {
buf->prev_padding = buf->chan->subbuf_size - buf->offset;
old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
buf->padding[old_subbuf] = buf->prev_padding;
buf->subbufs_produced++;
if (buf->dentry)
d_inode(buf->dentry)->i_size +=
buf->chan->subbuf_size -
buf->padding[old_subbuf];
else
buf->early_bytes += buf->chan->subbuf_size -
buf->padding[old_subbuf];
smp_mb();
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
if (waitqueue_active(&buf->read_wait)) {
/*
* Calling wake_up_interruptible() from here
* will deadlock if we happen to be logging
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
relay: Use irq_work instead of plain timer for deferred wakeup Relay avoids calling wake_up_interruptible() for doing the wakeup of readers/consumers, waiting for the generation of new data, from the context of a process which produced the data. This is apparently done to prevent the possibility of a deadlock in case Scheduler itself is is generating data for the relay, after acquiring rq->lock. The following patch used a timer (to be scheduled at next jiffy), for delegating the wakeup to another context. commit 7c9cb38302e78d24e37f7d8a2ea7eed4ae5f2fa7 Author: Tom Zanussi <zanussi@comcast.net> Date: Wed May 9 02:34:01 2007 -0700 relay: use plain timer instead of delayed work relay doesn't need to use schedule_delayed_work() for waking readers when a simple timer will do. Scheduling a plain timer, at next jiffies boundary, to do the wakeup causes a significant wakeup latency for the Userspace client, which makes relay less suitable for the high-frequency low-payload use cases where the data gets generated at a very high rate, like multiple sub buffers getting filled within a milli second. Moreover the timer is re-scheduled on every newly produced sub buffer so the timer keeps getting pushed out if sub buffers are filled in a very quick succession (less than a jiffy gap between filling of 2 sub buffers). As a result relay runs out of sub buffers to store the new data. By using irq_work it is ensured that wakeup of userspace client, blocked in the poll call, is done at earliest (through self IPI or next timer tick) enabling it to always consume the data in time. Also this makes relay consistent with printk & ring buffers (trace), as they too use irq_work for deferred wake up of readers. [arnd@arndb.de: select CONFIG_IRQ_WORK] Link: http://lkml.kernel.org/r/20160912154035.3222156-1-arnd@arndb.de [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1472906487-1559-1-git-send-email-akash.goel@intel.com Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Akash Goel <akash.goel@intel.com> Cc: Tom Zanussi <tzanussi@gmail.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-12 04:54:33 +08:00
irq_work_queue(&buf->wakeup_work);
}
}
old = buf->data;
new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
new = buf->start + new_subbuf * buf->chan->subbuf_size;
buf->offset = 0;
if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) {
buf->offset = buf->chan->subbuf_size + 1;
return 0;
}
buf->data = new;
buf->padding[new_subbuf] = 0;
if (unlikely(length + buf->offset > buf->chan->subbuf_size))
goto toobig;
return length;
toobig:
buf->chan->last_toobig = length;
return 0;
}
EXPORT_SYMBOL_GPL(relay_switch_subbuf);
/**
* relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
* @chan: the channel
* @cpu: the cpu associated with the channel buffer to update
* @subbufs_consumed: number of sub-buffers to add to current buf's count
*
* Adds to the channel buffer's consumed sub-buffer count.
* subbufs_consumed should be the number of sub-buffers newly consumed,
* not the total consumed.
*
* NOTE. Kernel clients don't need to call this function if the channel
* mode is 'overwrite'.
*/
void relay_subbufs_consumed(struct rchan *chan,
unsigned int cpu,
size_t subbufs_consumed)
{
struct rchan_buf *buf;
if (!chan || cpu >= NR_CPUS)
return;
buf = *per_cpu_ptr(chan->buf, cpu);
if (!buf || subbufs_consumed > chan->n_subbufs)
return;
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
buf->subbufs_consumed += subbufs_consumed;
}
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
/**
* relay_close - close the channel
* @chan: the channel
*
* Closes all channel buffers and frees the channel.
*/
void relay_close(struct rchan *chan)
{
struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
mutex_lock(&relay_channels_mutex);
if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0)))
relay_close_buf(buf);
else
for_each_possible_cpu(i)
if ((buf = *per_cpu_ptr(chan->buf, i)))
relay_close_buf(buf);
if (chan->last_toobig)
printk(KERN_WARNING "relay: one or more items not logged "
"[item size (%zd) > sub-buffer size (%zd)]\n",
chan->last_toobig, chan->subbuf_size);
list_del(&chan->list);
kref_put(&chan->kref, relay_destroy_channel);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_close);
/**
* relay_flush - close the channel
* @chan: the channel
*
* Flushes all channel buffers, i.e. forces buffer switch.
*/
void relay_flush(struct rchan *chan)
{
struct rchan_buf *buf;
unsigned int i;
if (!chan)
return;
if (chan->is_global && (buf = *per_cpu_ptr(chan->buf, 0))) {
relay_switch_subbuf(buf, 0);
return;
}
mutex_lock(&relay_channels_mutex);
for_each_possible_cpu(i)
if ((buf = *per_cpu_ptr(chan->buf, i)))
relay_switch_subbuf(buf, 0);
mutex_unlock(&relay_channels_mutex);
}
EXPORT_SYMBOL_GPL(relay_flush);
/**
* relay_file_open - open file op for relay files
* @inode: the inode
* @filp: the file
*
* Increments the channel buffer refcount.
*/
static int relay_file_open(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = inode->i_private;
kref_get(&buf->kref);
filp->private_data = buf;
return nonseekable_open(inode, filp);
}
/**
* relay_file_mmap - mmap file op for relay files
* @filp: the file
* @vma: the vma describing what to map
*
* Calls upon relay_mmap_buf() to map the file into user space.
*/
static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct rchan_buf *buf = filp->private_data;
return relay_mmap_buf(buf, vma);
}
/**
* relay_file_poll - poll file op for relay files
* @filp: the file
* @wait: poll table
*
* Poll implemention.
*/
static __poll_t relay_file_poll(struct file *filp, poll_table *wait)
{
__poll_t mask = 0;
struct rchan_buf *buf = filp->private_data;
if (buf->finalized)
return EPOLLERR;
if (filp->f_mode & FMODE_READ) {
poll_wait(filp, &buf->read_wait, wait);
if (!relay_buf_empty(buf))
mask |= EPOLLIN | EPOLLRDNORM;
}
return mask;
}
/**
* relay_file_release - release file op for relay files
* @inode: the inode
* @filp: the file
*
* Decrements the channel refcount, as the filesystem is
* no longer using it.
*/
static int relay_file_release(struct inode *inode, struct file *filp)
{
struct rchan_buf *buf = filp->private_data;
kref_put(&buf->kref, relay_remove_buf);
return 0;
}
/*
* relay_file_read_consume - update the consumed count for the buffer
*/
static void relay_file_read_consume(struct rchan_buf *buf,
size_t read_pos,
size_t bytes_consumed)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t read_subbuf;
if (buf->subbufs_produced == buf->subbufs_consumed &&
buf->offset == buf->bytes_consumed)
return;
if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
buf->bytes_consumed += bytes_consumed;
if (!read_pos)
read_subbuf = buf->subbufs_consumed % n_subbufs;
else
read_subbuf = read_pos / buf->chan->subbuf_size;
if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
(buf->offset == subbuf_size))
return;
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
}
}
/*
* relay_file_read_avail - boolean, are there unconsumed bytes available?
*/
static int relay_file_read_avail(struct rchan_buf *buf)
{
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
size_t consumed;
relay_file_read_consume(buf, 0, 0);
consumed = buf->subbufs_consumed;
if (unlikely(buf->offset > subbuf_size)) {
if (produced == consumed)
return 0;
return 1;
}
if (unlikely(produced - consumed >= n_subbufs)) {
consumed = produced - n_subbufs + 1;
buf->subbufs_consumed = consumed;
buf->bytes_consumed = 0;
}
produced = (produced % n_subbufs) * subbuf_size + buf->offset;
consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
if (consumed > produced)
produced += n_subbufs * subbuf_size;
if (consumed == produced) {
if (buf->offset == subbuf_size &&
buf->subbufs_produced > buf->subbufs_consumed)
return 1;
return 0;
}
return 1;
}
/**
* relay_file_read_subbuf_avail - return bytes available in sub-buffer
* @read_pos: file read position
* @buf: relay channel buffer
*/
static size_t relay_file_read_subbuf_avail(size_t read_pos,
struct rchan_buf *buf)
{
size_t padding, avail = 0;
size_t read_subbuf, read_offset, write_subbuf, write_offset;
size_t subbuf_size = buf->chan->subbuf_size;
write_subbuf = (buf->data - buf->start) / subbuf_size;
write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
read_subbuf = read_pos / subbuf_size;
read_offset = read_pos % subbuf_size;
padding = buf->padding[read_subbuf];
if (read_subbuf == write_subbuf) {
if (read_offset + padding < write_offset)
avail = write_offset - (read_offset + padding);
} else
avail = (subbuf_size - padding) - read_offset;
return avail;
}
/**
* relay_file_read_start_pos - find the first available byte to read
* @buf: relay channel buffer
*
* If the read_pos is in the middle of padding, return the
* position of the first actually available byte, otherwise
* return the original value.
*/
static size_t relay_file_read_start_pos(struct rchan_buf *buf)
{
size_t read_subbuf, padding, padding_start, padding_end;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t consumed = buf->subbufs_consumed % n_subbufs;
relayfs: fix out-of-bounds access in relay_file_read There is a crash in relay_file_read, as the var from point to the end of last subbuf. The oops looks something like: pc : __arch_copy_to_user+0x180/0x310 lr : relay_file_read+0x20c/0x2c8 Call trace: __arch_copy_to_user+0x180/0x310 full_proxy_read+0x68/0x98 vfs_read+0xb0/0x1d0 ksys_read+0x6c/0xf0 __arm64_sys_read+0x20/0x28 el0_svc_common.constprop.3+0x84/0x108 do_el0_svc+0x74/0x90 el0_svc+0x1c/0x28 el0_sync_handler+0x88/0xb0 el0_sync+0x148/0x180 We get the condition by analyzing the vmcore: 1). The last produced byte and last consumed byte both at the end of the last subbuf 2). A softirq calls function(e.g __blk_add_trace) to write relay buffer occurs when an program is calling relay_file_read_avail(). relay_file_read relay_file_read_avail relay_file_read_consume(buf, 0, 0); //interrupted by softirq who will write subbuf .... return 1; //read_start point to the end of the last subbuf read_start = relay_file_read_start_pos //avail is equal to subsize avail = relay_file_read_subbuf_avail //from points to an invalid memory address from = buf->start + read_start //system is crashed copy_to_user(buffer, from, avail) Link: https://lkml.kernel.org/r/20230419040203.37676-1-zhang.zhengming@h3c.com Fixes: 8d62fdebdaf9 ("relay file read: start-pos fix") Signed-off-by: Zhang Zhengming <zhang.zhengming@h3c.com> Reviewed-by: Zhao Lei <zhao_lei1@hoperun.com> Reviewed-by: Zhou Kete <zhou.kete@h3c.com> Reviewed-by: Pengcheng Yang <yangpc@wangsu.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-04-19 12:02:03 +08:00
size_t read_pos = (consumed * subbuf_size + buf->bytes_consumed)
% (n_subbufs * subbuf_size);
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
padding_start = (read_subbuf + 1) * subbuf_size - padding;
padding_end = (read_subbuf + 1) * subbuf_size;
if (read_pos >= padding_start && read_pos < padding_end) {
read_subbuf = (read_subbuf + 1) % n_subbufs;
read_pos = read_subbuf * subbuf_size;
}
return read_pos;
}
/**
* relay_file_read_end_pos - return the new read position
* @read_pos: file read position
* @buf: relay channel buffer
* @count: number of bytes to be read
*/
static size_t relay_file_read_end_pos(struct rchan_buf *buf,
size_t read_pos,
size_t count)
{
size_t read_subbuf, padding, end_pos;
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
read_subbuf = read_pos / subbuf_size;
padding = buf->padding[read_subbuf];
if (read_pos % subbuf_size + count + padding == subbuf_size)
end_pos = (read_subbuf + 1) * subbuf_size;
else
end_pos = read_pos + count;
if (end_pos >= subbuf_size * n_subbufs)
end_pos = 0;
return end_pos;
}
static ssize_t relay_file_read(struct file *filp,
char __user *buffer,
size_t count,
loff_t *ppos)
{
struct rchan_buf *buf = filp->private_data;
size_t read_start, avail;
size_t written = 0;
int ret;
if (!count)
return 0;
inode_lock(file_inode(filp));
do {
void *from;
if (!relay_file_read_avail(buf))
break;
read_start = relay_file_read_start_pos(buf);
avail = relay_file_read_subbuf_avail(read_start, buf);
if (!avail)
break;
avail = min(count, avail);
from = buf->start + read_start;
ret = avail;
if (copy_to_user(buffer, from, avail))
break;
buffer += ret;
written += ret;
count -= ret;
relay_file_read_consume(buf, read_start, ret);
*ppos = relay_file_read_end_pos(buf, read_start, ret);
} while (count);
inode_unlock(file_inode(filp));
return written;
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
{
rbuf->bytes_consumed += bytes_consumed;
if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
}
}
static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct rchan_buf *rbuf;
rbuf = (struct rchan_buf *)page_private(buf->page);
relay_consume_bytes(rbuf, buf->private);
}
static const struct pipe_buf_operations relay_pipe_buf_ops = {
.release = relay_pipe_buf_release,
.try_steal = generic_pipe_buf_try_steal,
.get = generic_pipe_buf_get,
};
static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
{
}
/*
* subbuf_splice_actor - splice up to one subbuf's worth of data
*/
static ssize_t subbuf_splice_actor(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags,
int *nonpad_ret)
{
unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size;
size_t read_start = (size_t) do_div(pos, alloc_size);
size_t read_subbuf = read_start / subbuf_size;
size_t padding = rbuf->padding[read_subbuf];
size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
struct page *pages[PIPE_DEF_BUFFERS];
struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
.nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
.ops = &relay_pipe_buf_ops,
.spd_release = relay_page_release,
};
ssize_t ret;
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
return 0;
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
/*
* Adjust read len, if longer than what is available
*/
if (len > (subbuf_size - read_start % subbuf_size))
len = subbuf_size - read_start % subbuf_size;
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
unsigned int cur_pos = read_start + total_len;
if (!len)
break;
this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
private = this_len;
spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
spd.partial[spd.nr_pages].offset = poff;
this_end = cur_pos + this_len;
if (this_end >= nonpad_end) {
this_len = nonpad_end - cur_pos;
private = this_len + padding;
}
spd.partial[spd.nr_pages].len = this_len;
spd.partial[spd.nr_pages].private = private;
len -= this_len;
total_len += this_len;
poff = 0;
pidx = (pidx + 1) % subbuf_pages;
if (this_end >= nonpad_end) {
spd.nr_pages++;
break;
}
}
ret = 0;
if (!spd.nr_pages)
goto out;
ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
if (ret < 0 || ret < total_len)
goto out;
if (read_start + ret == nonpad_end)
ret += padding;
out:
splice_shrink_spd(&spd);
return ret;
}
static ssize_t relay_file_splice_read(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags)
{
ssize_t spliced;
int ret;
int nonpad_ret = 0;
ret = 0;
spliced = 0;
while (len && !spliced) {
ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
if (ret < 0)
break;
else if (!ret) {
if (flags & SPLICE_F_NONBLOCK)
ret = -EAGAIN;
break;
}
*ppos += ret;
if (ret > len)
len = 0;
else
len -= ret;
spliced += nonpad_ret;
nonpad_ret = 0;
}
if (spliced)
return spliced;
return ret;
}
const struct file_operations relay_file_operations = {
.open = relay_file_open,
.poll = relay_file_poll,
.mmap = relay_file_mmap,
.read = relay_file_read,
.llseek = no_llseek,
.release = relay_file_release,
.splice_read = relay_file_splice_read,
};
EXPORT_SYMBOL_GPL(relay_file_operations);