linux/fs/pstore/platform.c

849 lines
19 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* Persistent Storage - platform driver interface parts.
*
* Copyright (C) 2007-2008 Google, Inc.
* Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
*/
#define pr_fmt(fmt) "pstore: " fmt
#include <linux/atomic.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kmsg_dump.h>
#include <linux/console.h>
#include <linux/module.h>
#include <linux/pstore.h>
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
#include <linux/lzo.h>
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
#include <linux/lz4.h>
#endif
#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
#include <linux/zstd.h>
#endif
#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>
#include "internal.h"
/*
* We defer making "oops" entries appear in pstore - see
* whether the system is actually still running well enough
* to let someone see the entry
*/
pstore/platform: Disable automatic updates by default Having automatic updates seems pointless for production system, and even dangerous and thus counter-productive: 1. If we can mount pstore, or read files, we can as well read /proc/kmsg. So, there's little point in duplicating the functionality and present the same information but via another userland ABI; 2. Expecting the kernel to behave sanely after oops/panic is naive. It might work, but you'd rather not try it. Screwed up kernel can do rather bad things, like recursive faults[1]; and pstore rather provoking bad things to happen. It uses: 1. Timers (assumes sane interrupts state); 2. Workqueues and mutexes (assumes scheduler in a sane state); 3. kzalloc (a working slab allocator); That's too much for a dead kernel, so the debugging facility itself might just make debugging harder, which is not what we want. Maybe for non-oops message types it would make sense to re-enable automatic updates, but so far I don't see any use case for this. Even for tracing, it has its own run-time/normal ABI, so we're only interested in pstore upon next boot, to retrieve what has gone wrong with HW or SW. So, let's disable the updates by default. [1] BUG: unable to handle kernel paging request at fffffffffffffff8 IP: [<ffffffff8104801b>] kthread_data+0xb/0x20 [...] Process kworker/0:1 (pid: 14, threadinfo ffff8800072c0000, task ffff88000725b100) [... Call Trace: [<ffffffff81043710>] wq_worker_sleeping+0x10/0xa0 [<ffffffff813687a8>] __schedule+0x568/0x7d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81087e22>] ? call_rcu_sched+0x12/0x20 [<ffffffff8102b596>] ? release_task+0x156/0x2d0 [<ffffffff8102b45e>] ? release_task+0x1e/0x2d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81368ac4>] schedule+0x24/0x70 [<ffffffff8102cba8>] do_exit+0x1f8/0x370 [<ffffffff810051e7>] oops_end+0x77/0xb0 [<ffffffff8135c301>] no_context+0x1a6/0x1b5 [<ffffffff8135c4de>] __bad_area_nosemaphore+0x1ce/0x1ed [<ffffffff81053156>] ? ttwu_queue+0xc6/0xe0 [<ffffffff8135c50b>] bad_area_nosemaphore+0xe/0x10 [<ffffffff8101fa47>] do_page_fault+0x2c7/0x450 [<ffffffff8106e34b>] ? __lock_release+0x6b/0xe0 [<ffffffff8106bf21>] ? mark_held_locks+0x61/0x140 [<ffffffff810502fe>] ? __wake_up+0x4e/0x70 [<ffffffff81185f7d>] ? trace_hardirqs_off_thunk+0x3a/0x3c [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8136a37f>] page_fault+0x1f/0x30 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff81185ab8>] ? memcpy+0x68/0x110 [<ffffffff8115875a>] ? pstore_get_records+0x3a/0x130 [<ffffffff811590f4>] ? persistent_ram_copy_old+0x64/0x90 [<ffffffff81158bf4>] ramoops_pstore_read+0x84/0x130 [<ffffffff81158799>] pstore_get_records+0x79/0x130 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8115897e>] pstore_dowork+0xe/0x10 [<ffffffff81042594>] process_one_work+0x174/0x450 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81042e13>] worker_thread+0x123/0x2d0 [<ffffffff81042cf0>] ? manage_workers.isra.28+0x120/0x120 [<ffffffff81047d8e>] kthread+0x8e/0xa0 [<ffffffff8136ba74>] kernel_thread_helper+0x4/0x10 [<ffffffff8136a199>] ? retint_restore_args+0xe/0xe [<ffffffff81047d00>] ? __init_kthread_worker+0x70/0x70 [<ffffffff8136ba70>] ? gs_change+0xb/0xb Code: be e2 00 00 00 48 c7 c7 d1 2a 4e 81 e8 bf fb fd ff 48 8b 5d f0 4c 8b 65 f8 c9 c3 0f 1f 44 00 00 48 8b 87 08 02 00 00 55 48 89 e5 <48> 8b 40 f8 5d c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 RIP [<ffffffff8104801b>] kthread_data+0xb/0x20 RSP <ffff8800072c1888> CR2: fffffffffffffff8 ---[ end trace 996a332dc399111d ]--- Fixing recursive fault but reboot is needed! Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2012-05-26 21:20:29 +08:00
static int pstore_update_ms = -1;
module_param_named(update_ms, pstore_update_ms, int, 0600);
MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
pstore/platform: Disable automatic updates by default Having automatic updates seems pointless for production system, and even dangerous and thus counter-productive: 1. If we can mount pstore, or read files, we can as well read /proc/kmsg. So, there's little point in duplicating the functionality and present the same information but via another userland ABI; 2. Expecting the kernel to behave sanely after oops/panic is naive. It might work, but you'd rather not try it. Screwed up kernel can do rather bad things, like recursive faults[1]; and pstore rather provoking bad things to happen. It uses: 1. Timers (assumes sane interrupts state); 2. Workqueues and mutexes (assumes scheduler in a sane state); 3. kzalloc (a working slab allocator); That's too much for a dead kernel, so the debugging facility itself might just make debugging harder, which is not what we want. Maybe for non-oops message types it would make sense to re-enable automatic updates, but so far I don't see any use case for this. Even for tracing, it has its own run-time/normal ABI, so we're only interested in pstore upon next boot, to retrieve what has gone wrong with HW or SW. So, let's disable the updates by default. [1] BUG: unable to handle kernel paging request at fffffffffffffff8 IP: [<ffffffff8104801b>] kthread_data+0xb/0x20 [...] Process kworker/0:1 (pid: 14, threadinfo ffff8800072c0000, task ffff88000725b100) [... Call Trace: [<ffffffff81043710>] wq_worker_sleeping+0x10/0xa0 [<ffffffff813687a8>] __schedule+0x568/0x7d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81087e22>] ? call_rcu_sched+0x12/0x20 [<ffffffff8102b596>] ? release_task+0x156/0x2d0 [<ffffffff8102b45e>] ? release_task+0x1e/0x2d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81368ac4>] schedule+0x24/0x70 [<ffffffff8102cba8>] do_exit+0x1f8/0x370 [<ffffffff810051e7>] oops_end+0x77/0xb0 [<ffffffff8135c301>] no_context+0x1a6/0x1b5 [<ffffffff8135c4de>] __bad_area_nosemaphore+0x1ce/0x1ed [<ffffffff81053156>] ? ttwu_queue+0xc6/0xe0 [<ffffffff8135c50b>] bad_area_nosemaphore+0xe/0x10 [<ffffffff8101fa47>] do_page_fault+0x2c7/0x450 [<ffffffff8106e34b>] ? __lock_release+0x6b/0xe0 [<ffffffff8106bf21>] ? mark_held_locks+0x61/0x140 [<ffffffff810502fe>] ? __wake_up+0x4e/0x70 [<ffffffff81185f7d>] ? trace_hardirqs_off_thunk+0x3a/0x3c [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8136a37f>] page_fault+0x1f/0x30 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff81185ab8>] ? memcpy+0x68/0x110 [<ffffffff8115875a>] ? pstore_get_records+0x3a/0x130 [<ffffffff811590f4>] ? persistent_ram_copy_old+0x64/0x90 [<ffffffff81158bf4>] ramoops_pstore_read+0x84/0x130 [<ffffffff81158799>] pstore_get_records+0x79/0x130 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8115897e>] pstore_dowork+0xe/0x10 [<ffffffff81042594>] process_one_work+0x174/0x450 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81042e13>] worker_thread+0x123/0x2d0 [<ffffffff81042cf0>] ? manage_workers.isra.28+0x120/0x120 [<ffffffff81047d8e>] kthread+0x8e/0xa0 [<ffffffff8136ba74>] kernel_thread_helper+0x4/0x10 [<ffffffff8136a199>] ? retint_restore_args+0xe/0xe [<ffffffff81047d00>] ? __init_kthread_worker+0x70/0x70 [<ffffffff8136ba70>] ? gs_change+0xb/0xb Code: be e2 00 00 00 48 c7 c7 d1 2a 4e 81 e8 bf fb fd ff 48 8b 5d f0 4c 8b 65 f8 c9 c3 0f 1f 44 00 00 48 8b 87 08 02 00 00 55 48 89 e5 <48> 8b 40 f8 5d c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 RIP [<ffffffff8104801b>] kthread_data+0xb/0x20 RSP <ffff8800072c1888> CR2: fffffffffffffff8 ---[ end trace 996a332dc399111d ]--- Fixing recursive fault but reboot is needed! Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2012-05-26 21:20:29 +08:00
"(default is -1, which means runtime updates are disabled; "
"enabling this option may not be safe; it may lead to further "
pstore/platform: Disable automatic updates by default Having automatic updates seems pointless for production system, and even dangerous and thus counter-productive: 1. If we can mount pstore, or read files, we can as well read /proc/kmsg. So, there's little point in duplicating the functionality and present the same information but via another userland ABI; 2. Expecting the kernel to behave sanely after oops/panic is naive. It might work, but you'd rather not try it. Screwed up kernel can do rather bad things, like recursive faults[1]; and pstore rather provoking bad things to happen. It uses: 1. Timers (assumes sane interrupts state); 2. Workqueues and mutexes (assumes scheduler in a sane state); 3. kzalloc (a working slab allocator); That's too much for a dead kernel, so the debugging facility itself might just make debugging harder, which is not what we want. Maybe for non-oops message types it would make sense to re-enable automatic updates, but so far I don't see any use case for this. Even for tracing, it has its own run-time/normal ABI, so we're only interested in pstore upon next boot, to retrieve what has gone wrong with HW or SW. So, let's disable the updates by default. [1] BUG: unable to handle kernel paging request at fffffffffffffff8 IP: [<ffffffff8104801b>] kthread_data+0xb/0x20 [...] Process kworker/0:1 (pid: 14, threadinfo ffff8800072c0000, task ffff88000725b100) [... Call Trace: [<ffffffff81043710>] wq_worker_sleeping+0x10/0xa0 [<ffffffff813687a8>] __schedule+0x568/0x7d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81087e22>] ? call_rcu_sched+0x12/0x20 [<ffffffff8102b596>] ? release_task+0x156/0x2d0 [<ffffffff8102b45e>] ? release_task+0x1e/0x2d0 [<ffffffff8106c24d>] ? trace_hardirqs_on+0xd/0x10 [<ffffffff81368ac4>] schedule+0x24/0x70 [<ffffffff8102cba8>] do_exit+0x1f8/0x370 [<ffffffff810051e7>] oops_end+0x77/0xb0 [<ffffffff8135c301>] no_context+0x1a6/0x1b5 [<ffffffff8135c4de>] __bad_area_nosemaphore+0x1ce/0x1ed [<ffffffff81053156>] ? ttwu_queue+0xc6/0xe0 [<ffffffff8135c50b>] bad_area_nosemaphore+0xe/0x10 [<ffffffff8101fa47>] do_page_fault+0x2c7/0x450 [<ffffffff8106e34b>] ? __lock_release+0x6b/0xe0 [<ffffffff8106bf21>] ? mark_held_locks+0x61/0x140 [<ffffffff810502fe>] ? __wake_up+0x4e/0x70 [<ffffffff81185f7d>] ? trace_hardirqs_off_thunk+0x3a/0x3c [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8136a37f>] page_fault+0x1f/0x30 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff81185ab8>] ? memcpy+0x68/0x110 [<ffffffff8115875a>] ? pstore_get_records+0x3a/0x130 [<ffffffff811590f4>] ? persistent_ram_copy_old+0x64/0x90 [<ffffffff81158bf4>] ramoops_pstore_read+0x84/0x130 [<ffffffff81158799>] pstore_get_records+0x79/0x130 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81158970>] ? pstore_register+0x120/0x120 [<ffffffff8115897e>] pstore_dowork+0xe/0x10 [<ffffffff81042594>] process_one_work+0x174/0x450 [<ffffffff81042536>] ? process_one_work+0x116/0x450 [<ffffffff81042e13>] worker_thread+0x123/0x2d0 [<ffffffff81042cf0>] ? manage_workers.isra.28+0x120/0x120 [<ffffffff81047d8e>] kthread+0x8e/0xa0 [<ffffffff8136ba74>] kernel_thread_helper+0x4/0x10 [<ffffffff8136a199>] ? retint_restore_args+0xe/0xe [<ffffffff81047d00>] ? __init_kthread_worker+0x70/0x70 [<ffffffff8136ba70>] ? gs_change+0xb/0xb Code: be e2 00 00 00 48 c7 c7 d1 2a 4e 81 e8 bf fb fd ff 48 8b 5d f0 4c 8b 65 f8 c9 c3 0f 1f 44 00 00 48 8b 87 08 02 00 00 55 48 89 e5 <48> 8b 40 f8 5d c3 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 RIP [<ffffffff8104801b>] kthread_data+0xb/0x20 RSP <ffff8800072c1888> CR2: fffffffffffffff8 ---[ end trace 996a332dc399111d ]--- Fixing recursive fault but reboot is needed! Signed-off-by: Anton Vorontsov <anton.vorontsov@linaro.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2012-05-26 21:20:29 +08:00
"corruption on Oopses)");
/* Names should be in the same order as the enum pstore_type_id */
static const char * const pstore_type_names[] = {
"dmesg",
"mce",
"console",
"ftrace",
"rtas",
"powerpc-ofw",
"powerpc-common",
"pmsg",
"powerpc-opal",
};
static int pstore_new_entry;
static void pstore_timefunc(struct timer_list *);
timer: Remove expires and data arguments from DEFINE_TIMER Drop the arguments from the macro and adjust all callers with the following script: perl -pi -e 's/DEFINE_TIMER\((.*), 0, 0\);/DEFINE_TIMER($1);/g;' \ $(git grep DEFINE_TIMER | cut -d: -f1 | sort -u | grep -v timer.h) Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # for m68k parts Acked-by: Guenter Roeck <linux@roeck-us.net> # for watchdog parts Acked-by: David S. Miller <davem@davemloft.net> # for networking parts Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Acked-by: Kalle Valo <kvalo@codeaurora.org> # for wireless parts Acked-by: Arnd Bergmann <arnd@arndb.de> Cc: linux-mips@linux-mips.org Cc: Petr Mladek <pmladek@suse.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Lai Jiangshan <jiangshanlai@gmail.com> Cc: Sebastian Reichel <sre@kernel.org> Cc: Kalle Valo <kvalo@qca.qualcomm.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Pavel Machek <pavel@ucw.cz> Cc: linux1394-devel@lists.sourceforge.net Cc: Chris Metcalf <cmetcalf@mellanox.com> Cc: linux-s390@vger.kernel.org Cc: linux-wireless@vger.kernel.org Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com> Cc: Wim Van Sebroeck <wim@iguana.be> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Ursula Braun <ubraun@linux.vnet.ibm.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: Harish Patil <harish.patil@cavium.com> Cc: Stephen Boyd <sboyd@codeaurora.org> Cc: Michael Reed <mdr@sgi.com> Cc: Manish Chopra <manish.chopra@cavium.com> Cc: Len Brown <len.brown@intel.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: linux-pm@vger.kernel.org Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Tejun Heo <tj@kernel.org> Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com> Cc: John Stultz <john.stultz@linaro.org> Cc: Mark Gross <mark.gross@intel.com> Cc: linux-watchdog@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: "Martin K. Petersen" <martin.petersen@oracle.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Stefan Richter <stefanr@s5r6.in-berlin.de> Cc: Guenter Roeck <linux@roeck-us.net> Cc: netdev@vger.kernel.org Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: linuxppc-dev@lists.ozlabs.org Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com> Link: https://lkml.kernel.org/r/1507159627-127660-11-git-send-email-keescook@chromium.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2017-10-05 07:27:04 +08:00
static DEFINE_TIMER(pstore_timer, pstore_timefunc);
static void pstore_dowork(struct work_struct *);
static DECLARE_WORK(pstore_work, pstore_dowork);
/*
* psinfo_lock protects "psinfo" during calls to
* pstore_register(), pstore_unregister(), and
* the filesystem mount/unmount routines.
*/
static DEFINE_MUTEX(psinfo_lock);
struct pstore_info *psinfo;
static char *backend;
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "specific backend to use");
static char *compress =
#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT
CONFIG_PSTORE_COMPRESS_DEFAULT;
#else
NULL;
#endif
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
/* Compression parameters */
static struct crypto_comp *tfm;
struct pstore_zbackend {
int (*zbufsize)(size_t size);
const char *name;
};
static char *big_oops_buf;
static size_t big_oops_buf_sz;
/* How much of the console log to snapshot */
unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
void pstore_set_kmsg_bytes(int bytes)
{
kmsg_bytes = bytes;
}
/* Tag each group of saved records with a sequence number */
static int oopscount;
const char *pstore_type_to_name(enum pstore_type_id type)
{
BUILD_BUG_ON(ARRAY_SIZE(pstore_type_names) != PSTORE_TYPE_MAX);
if (WARN_ON_ONCE(type >= PSTORE_TYPE_MAX))
return "unknown";
return pstore_type_names[type];
}
EXPORT_SYMBOL_GPL(pstore_type_to_name);
enum pstore_type_id pstore_name_to_type(const char *name)
{
int i;
for (i = 0; i < PSTORE_TYPE_MAX; i++) {
if (!strcmp(pstore_type_names[i], name))
return i;
}
return PSTORE_TYPE_MAX;
}
EXPORT_SYMBOL_GPL(pstore_name_to_type);
static void pstore_timer_kick(void)
{
if (pstore_update_ms < 0)
return;
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
pstore: Avoid deadlock in panic and emergency-restart path [Issue] When pstore is in panic and emergency-restart paths, it may be blocked in those paths because it simply takes spin_lock. This is an example scenario which pstore may hang up in a panic path: - cpuA grabs psinfo->buf_lock - cpuB panics and calls smp_send_stop - smp_send_stop sends IRQ to cpuA - after 1 second, cpuB gives up on cpuA and sends an NMI instead - cpuA is now in an NMI handler while still holding buf_lock - cpuB is deadlocked This case may happen if a firmware has a bug and cpuA is stuck talking with it more than one second. Also, this is a similar scenario in an emergency-restart path: - cpuA grabs psinfo->buf_lock and stucks in a firmware - cpuB kicks emergency-restart via either sysrq-b or hangcheck timer. And then, cpuB is deadlocked by taking psinfo->buf_lock again. [Solution] This patch avoids the deadlocking issues in both panic and emergency_restart paths by introducing a function, is_non_blocking_path(), to check if a cpu can be blocked in current path. With this patch, pstore is not blocked even if another cpu has taken a spin_lock, in those paths by changing from spin_lock_irqsave to spin_trylock_irqsave. In addition, according to a comment of emergency_restart() in kernel/sys.c, spin_lock shouldn't be taken in an emergency_restart path to avoid deadlock. This patch fits the comment below. <snip> /** * emergency_restart - reboot the system * * Without shutting down any hardware or taking any locks * reboot the system. This is called when we know we are in * trouble so this is our best effort to reboot. This is * safe to call in interrupt context. */ void emergency_restart(void) <snip> Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com> Acked-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
2013-01-12 02:09:41 +08:00
{
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
/*
* In case of NMI path, pstore shouldn't be blocked
* regardless of reason.
*/
pstore: Avoid deadlock in panic and emergency-restart path [Issue] When pstore is in panic and emergency-restart paths, it may be blocked in those paths because it simply takes spin_lock. This is an example scenario which pstore may hang up in a panic path: - cpuA grabs psinfo->buf_lock - cpuB panics and calls smp_send_stop - smp_send_stop sends IRQ to cpuA - after 1 second, cpuB gives up on cpuA and sends an NMI instead - cpuA is now in an NMI handler while still holding buf_lock - cpuB is deadlocked This case may happen if a firmware has a bug and cpuA is stuck talking with it more than one second. Also, this is a similar scenario in an emergency-restart path: - cpuA grabs psinfo->buf_lock and stucks in a firmware - cpuB kicks emergency-restart via either sysrq-b or hangcheck timer. And then, cpuB is deadlocked by taking psinfo->buf_lock again. [Solution] This patch avoids the deadlocking issues in both panic and emergency_restart paths by introducing a function, is_non_blocking_path(), to check if a cpu can be blocked in current path. With this patch, pstore is not blocked even if another cpu has taken a spin_lock, in those paths by changing from spin_lock_irqsave to spin_trylock_irqsave. In addition, according to a comment of emergency_restart() in kernel/sys.c, spin_lock shouldn't be taken in an emergency_restart path to avoid deadlock. This patch fits the comment below. <snip> /** * emergency_restart - reboot the system * * Without shutting down any hardware or taking any locks * reboot the system. This is called when we know we are in * trouble so this is our best effort to reboot. This is * safe to call in interrupt context. */ void emergency_restart(void) <snip> Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com> Acked-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
2013-01-12 02:09:41 +08:00
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
/*
* Emergency restart shouldn't be blocked by spinning on
* pstore_info::buf_lock.
*/
pstore: Avoid deadlock in panic and emergency-restart path [Issue] When pstore is in panic and emergency-restart paths, it may be blocked in those paths because it simply takes spin_lock. This is an example scenario which pstore may hang up in a panic path: - cpuA grabs psinfo->buf_lock - cpuB panics and calls smp_send_stop - smp_send_stop sends IRQ to cpuA - after 1 second, cpuB gives up on cpuA and sends an NMI instead - cpuA is now in an NMI handler while still holding buf_lock - cpuB is deadlocked This case may happen if a firmware has a bug and cpuA is stuck talking with it more than one second. Also, this is a similar scenario in an emergency-restart path: - cpuA grabs psinfo->buf_lock and stucks in a firmware - cpuB kicks emergency-restart via either sysrq-b or hangcheck timer. And then, cpuB is deadlocked by taking psinfo->buf_lock again. [Solution] This patch avoids the deadlocking issues in both panic and emergency_restart paths by introducing a function, is_non_blocking_path(), to check if a cpu can be blocked in current path. With this patch, pstore is not blocked even if another cpu has taken a spin_lock, in those paths by changing from spin_lock_irqsave to spin_trylock_irqsave. In addition, according to a comment of emergency_restart() in kernel/sys.c, spin_lock shouldn't be taken in an emergency_restart path to avoid deadlock. This patch fits the comment below. <snip> /** * emergency_restart - reboot the system * * Without shutting down any hardware or taking any locks * reboot the system. This is called when we know we are in * trouble so this is our best effort to reboot. This is * safe to call in interrupt context. */ void emergency_restart(void) <snip> Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com> Acked-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
2013-01-12 02:09:41 +08:00
case KMSG_DUMP_EMERG:
return true;
default:
return false;
}
}
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
static int zbufsize_deflate(size_t size)
{
size_t cmpr;
switch (size) {
/* buffer range for efivars */
case 1000 ... 2000:
cmpr = 56;
break;
case 2001 ... 3000:
cmpr = 54;
break;
case 3001 ... 3999:
cmpr = 52;
break;
/* buffer range for nvram, erst */
case 4000 ... 10000:
cmpr = 45;
break;
default:
cmpr = 60;
break;
}
return (size * 100) / cmpr;
}
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
static int zbufsize_lzo(size_t size)
{
return lzo1x_worst_compress(size);
}
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
static int zbufsize_lz4(size_t size)
{
return LZ4_compressBound(size);
}
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
static int zbufsize_842(size_t size)
{
return size;
}
#endif
#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
static int zbufsize_zstd(size_t size)
{
return zstd_compress_bound(size);
}
#endif
static const struct pstore_zbackend *zbackend __ro_after_init;
static const struct pstore_zbackend zbackends[] = {
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
{
.zbufsize = zbufsize_deflate,
.name = "deflate",
},
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
{
.zbufsize = zbufsize_lzo,
.name = "lzo",
},
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS)
{
.zbufsize = zbufsize_lz4,
.name = "lz4",
},
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
{
.zbufsize = zbufsize_lz4,
.name = "lz4hc",
},
#endif
2018-03-15 23:34:08 +08:00
#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
{
.zbufsize = zbufsize_842,
.name = "842",
},
#endif
#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
{
.zbufsize = zbufsize_zstd,
.name = "zstd",
},
#endif
{ }
};
static int pstore_compress(const void *in, void *out,
unsigned int inlen, unsigned int outlen)
{
int ret;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
if (ret) {
pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
return ret;
}
return outlen;
}
static void allocate_buf_for_compression(void)
{
struct crypto_comp *ctx;
int size;
char *buf;
/* Skip if not built-in or compression backend not selected yet. */
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
return;
/* Skip if no pstore backend yet or compression init already done. */
if (!psinfo || tfm)
return;
if (!crypto_has_comp(zbackend->name, 0, 0)) {
pr_err("Unknown compression: %s\n", zbackend->name);
return;
}
size = zbackend->zbufsize(psinfo->bufsize);
if (size <= 0) {
pr_err("Invalid compression size for %s: %d\n",
zbackend->name, size);
return;
}
buf = kmalloc(size, GFP_KERNEL);
if (!buf) {
pr_err("Failed %d byte compression buffer allocation for: %s\n",
size, zbackend->name);
return;
}
ctx = crypto_alloc_comp(zbackend->name, 0, 0);
if (IS_ERR_OR_NULL(ctx)) {
kfree(buf);
pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
PTR_ERR(ctx));
return;
}
/* A non-NULL big_oops_buf indicates compression is available. */
tfm = ctx;
big_oops_buf_sz = size;
big_oops_buf = buf;
pr_info("Using crash dump compression: %s\n", zbackend->name);
}
static void free_buf_for_compression(void)
{
if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
crypto_free_comp(tfm);
tfm = NULL;
}
kfree(big_oops_buf);
big_oops_buf = NULL;
big_oops_buf_sz = 0;
}
/*
* Called when compression fails, since the printk buffer
* would be fetched for compression calling it again when
* compression fails would have moved the iterator of
* printk buffer which results in fetching old contents.
* Copy the recent messages from big_oops_buf to psinfo->buf
*/
static size_t copy_kmsg_to_buffer(int hsize, size_t len)
{
size_t total_len;
size_t diff;
total_len = hsize + len;
if (total_len > psinfo->bufsize) {
diff = total_len - psinfo->bufsize + hsize;
memcpy(psinfo->buf, big_oops_buf, hsize);
memcpy(psinfo->buf + hsize, big_oops_buf + diff,
psinfo->bufsize - hsize);
total_len = psinfo->bufsize;
} else
memcpy(psinfo->buf, big_oops_buf, total_len);
return total_len;
}
void pstore_record_init(struct pstore_record *record,
struct pstore_info *psinfo)
{
memset(record, 0, sizeof(*record));
record->psi = psinfo;
/* Report zeroed timestamp if called before timekeeping has resumed. */
record->time = ns_to_timespec64(ktime_get_real_fast_ns());
}
/*
* callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the
* end of the buffer.
*/
static void pstore_dump(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
struct kmsg_dump_iter iter;
unsigned long total = 0;
const char *why;
unsigned int part = 1;
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
if (pstore_cannot_block_path(reason)) {
if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
pr_err("dump skipped in %s path because of concurrent dump\n",
in_nmi() ? "NMI" : why);
return;
pstore: Avoid deadlock in panic and emergency-restart path [Issue] When pstore is in panic and emergency-restart paths, it may be blocked in those paths because it simply takes spin_lock. This is an example scenario which pstore may hang up in a panic path: - cpuA grabs psinfo->buf_lock - cpuB panics and calls smp_send_stop - smp_send_stop sends IRQ to cpuA - after 1 second, cpuB gives up on cpuA and sends an NMI instead - cpuA is now in an NMI handler while still holding buf_lock - cpuB is deadlocked This case may happen if a firmware has a bug and cpuA is stuck talking with it more than one second. Also, this is a similar scenario in an emergency-restart path: - cpuA grabs psinfo->buf_lock and stucks in a firmware - cpuB kicks emergency-restart via either sysrq-b or hangcheck timer. And then, cpuB is deadlocked by taking psinfo->buf_lock again. [Solution] This patch avoids the deadlocking issues in both panic and emergency_restart paths by introducing a function, is_non_blocking_path(), to check if a cpu can be blocked in current path. With this patch, pstore is not blocked even if another cpu has taken a spin_lock, in those paths by changing from spin_lock_irqsave to spin_trylock_irqsave. In addition, according to a comment of emergency_restart() in kernel/sys.c, spin_lock shouldn't be taken in an emergency_restart path to avoid deadlock. This patch fits the comment below. <snip> /** * emergency_restart - reboot the system * * Without shutting down any hardware or taking any locks * reboot the system. This is called when we know we are in * trouble so this is our best effort to reboot. This is * safe to call in interrupt context. */ void emergency_restart(void) <snip> Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com> Acked-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
2013-01-12 02:09:41 +08:00
}
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
} else {
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
oopscount++;
while (total < kmsg_bytes) {
char *dst;
size_t dst_size;
int header_size;
int zipped_len = -1;
size_t dump_size;
struct pstore_record record;
pstore_record_init(&record, psinfo);
record.type = PSTORE_TYPE_DMESG;
record.count = oopscount;
record.reason = reason;
record.part = part;
record.buf = psinfo->buf;
if (big_oops_buf) {
dst = big_oops_buf;
dst_size = big_oops_buf_sz;
} else {
dst = psinfo->buf;
dst_size = psinfo->bufsize;
}
/* Write dump header. */
header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
oopscount, part);
dst_size -= header_size;
/* Write dump contents. */
if (!kmsg_dump_get_buffer(&iter, true, dst + header_size,
dst_size, &dump_size))
break;
if (big_oops_buf) {
zipped_len = pstore_compress(dst, psinfo->buf,
header_size + dump_size,
psinfo->bufsize);
if (zipped_len > 0) {
record.compressed = true;
record.size = zipped_len;
} else {
record.size = copy_kmsg_to_buffer(header_size,
dump_size);
}
} else {
record.size = header_size + dump_size;
}
ret = psinfo->write(&record);
if (ret == 0 && reason == KMSG_DUMP_OOPS) {
pstore_new_entry = 1;
pstore_timer_kick();
}
total += record.size;
part++;
}
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
.dump = pstore_dump,
};
/*
* Register with kmsg_dump to save last part of console log on panic.
*/
static void pstore_register_kmsg(void)
{
kmsg_dump_register(&pstore_dumper);
}
static void pstore_unregister_kmsg(void)
{
kmsg_dump_unregister(&pstore_dumper);
}
#ifdef CONFIG_PSTORE_CONSOLE
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
struct pstore_record record;
if (!c)
return;
pstore_record_init(&record, psinfo);
record.type = PSTORE_TYPE_CONSOLE;
record.buf = (char *)s;
record.size = c;
psinfo->write(&record);
}
static struct console pstore_console = {
.write = pstore_console_write,
.index = -1,
};
static void pstore_register_console(void)
{
/* Show which backend is going to get console writes. */
strscpy(pstore_console.name, psinfo->name,
sizeof(pstore_console.name));
/*
* Always initialize flags here since prior unregister_console()
* calls may have changed settings (specifically CON_ENABLED).
*/
pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME;
register_console(&pstore_console);
}
static void pstore_unregister_console(void)
{
unregister_console(&pstore_console);
}
#else
static void pstore_register_console(void) {}
static void pstore_unregister_console(void) {}
#endif
static int pstore_write_user_compat(struct pstore_record *record,
const char __user *buf)
{
int ret = 0;
if (record->buf)
return -EINVAL;
record->buf = memdup_user(buf, record->size);
if (IS_ERR(record->buf)) {
ret = PTR_ERR(record->buf);
goto out;
}
ret = record->psi->write(record);
kfree(record->buf);
out:
record->buf = NULL;
return unlikely(ret < 0) ? ret : record->size;
}
/*
* platform specific persistent storage driver registers with
* us here. If pstore is already mounted, call the platform
* read function right away to populate the file system. If not
* then the pstore mount code will call us later to fill out
* the file system.
*/
int pstore_register(struct pstore_info *psi)
{
if (backend && strcmp(backend, psi->name)) {
pr_warn("ignoring unexpected backend '%s'\n", psi->name);
return -EPERM;
}
/* Sanity check flags. */
if (!psi->flags) {
pr_warn("backend '%s' must support at least one frontend\n",
psi->name);
return -EINVAL;
}
/* Check for required functions. */
if (!psi->read || !psi->write) {
pr_warn("backend '%s' must implement read() and write()\n",
psi->name);
return -EINVAL;
}
mutex_lock(&psinfo_lock);
if (psinfo) {
pr_warn("backend '%s' already loaded: ignoring '%s'\n",
psinfo->name, psi->name);
mutex_unlock(&psinfo_lock);
return -EBUSY;
}
if (!psi->write_user)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
pstore: pass allocated memory region back to caller The buf_lock cannot be held while populating the inodes, so make the backend pass forward an allocated and filled buffer instead. This solves the following backtrace. The effect is that "buf" is only ever used to notify the backends that something was written to it, and shouldn't be used in the read path. To replace the buf_lock during the read path, isolate the open/read/close loop with a separate mutex to maintain serialized access to the backend. Note that is is up to the pstore backend to cope if the (*write)() path is called in the middle of the read path. [ 59.691019] BUG: sleeping function called from invalid context at .../mm/slub.c:847 [ 59.691019] in_atomic(): 0, irqs_disabled(): 1, pid: 1819, name: mount [ 59.691019] Pid: 1819, comm: mount Not tainted 3.0.8 #1 [ 59.691019] Call Trace: [ 59.691019] [<810252d5>] __might_sleep+0xc3/0xca [ 59.691019] [<810a26e6>] kmem_cache_alloc+0x32/0xf3 [ 59.691019] [<810b53ac>] ? __d_lookup_rcu+0x6f/0xf4 [ 59.691019] [<810b68b1>] alloc_inode+0x2a/0x64 [ 59.691019] [<810b6903>] new_inode+0x18/0x43 [ 59.691019] [<81142447>] pstore_get_inode.isra.1+0x11/0x98 [ 59.691019] [<81142623>] pstore_mkfile+0xae/0x26f [ 59.691019] [<810a2a66>] ? kmem_cache_free+0x19/0xb1 [ 59.691019] [<8116c821>] ? ida_get_new_above+0x140/0x158 [ 59.691019] [<811708ea>] ? __init_rwsem+0x1e/0x2c [ 59.691019] [<810b67e8>] ? inode_init_always+0x111/0x1b0 [ 59.691019] [<8102127e>] ? should_resched+0xd/0x27 [ 59.691019] [<8137977f>] ? _cond_resched+0xd/0x21 [ 59.691019] [<81142abf>] pstore_get_records+0x52/0xa7 [ 59.691019] [<8114254b>] pstore_fill_super+0x7d/0x91 [ 59.691019] [<810a7ff5>] mount_single+0x46/0x82 [ 59.691019] [<8114231a>] pstore_mount+0x15/0x17 [ 59.691019] [<811424ce>] ? pstore_get_inode.isra.1+0x98/0x98 [ 59.691019] [<810a8199>] mount_fs+0x5a/0x12d [ 59.691019] [<810b9174>] ? alloc_vfsmnt+0xa4/0x14a [ 59.691019] [<810b9474>] vfs_kern_mount+0x4f/0x7d [ 59.691019] [<810b9d7e>] do_kern_mount+0x34/0xb2 [ 59.691019] [<810bb15f>] do_mount+0x5fc/0x64a [ 59.691019] [<810912fb>] ? strndup_user+0x2e/0x3f [ 59.691019] [<810bb3cb>] sys_mount+0x66/0x99 [ 59.691019] [<8137b537>] sysenter_do_call+0x12/0x26 Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Tony Luck <tony.luck@intel.com>
2011-11-18 04:58:07 +08:00
mutex_init(&psinfo->read_mutex);
pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
2022-03-15 02:59:53 +08:00
spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
pstore_get_records(0);
if (psi->flags & PSTORE_FLAGS_DMESG) {
pstore_dumper.max_reason = psinfo->max_reason;
pstore_register_kmsg();
}
if (psi->flags & PSTORE_FLAGS_CONSOLE)
pstore_register_console();
if (psi->flags & PSTORE_FLAGS_FTRACE)
pstore_register_ftrace();
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_register_pmsg();
/* Start watching for new records, if desired. */
pstore_timer_kick();
/*
* Update the module parameter backend, so it is visible
* through /sys/module/pstore/parameters/backend
*/
backend = kstrdup(psi->name, GFP_KERNEL);
pr_info("Registered %s as persistent store backend\n", psi->name);
mutex_unlock(&psinfo_lock);
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
void pstore_unregister(struct pstore_info *psi)
{
/* It's okay to unregister nothing. */
if (!psi)
return;
mutex_lock(&psinfo_lock);
/* Only one backend can be registered at a time. */
if (WARN_ON(psi != psinfo)) {
mutex_unlock(&psinfo_lock);
return;
}
/* Unregister all callbacks. */
if (psi->flags & PSTORE_FLAGS_PMSG)
pstore_unregister_pmsg();
if (psi->flags & PSTORE_FLAGS_FTRACE)
pstore_unregister_ftrace();
if (psi->flags & PSTORE_FLAGS_CONSOLE)
pstore_unregister_console();
if (psi->flags & PSTORE_FLAGS_DMESG)
pstore_unregister_kmsg();
/* Stop timer and make sure all work has finished. */
del_timer_sync(&pstore_timer);
flush_work(&pstore_work);
/* Remove all backend records from filesystem tree. */
pstore_put_backend_records(psi);
free_buf_for_compression();
psinfo = NULL;
kfree(backend);
backend = NULL;
mutex_unlock(&psinfo_lock);
}
EXPORT_SYMBOL_GPL(pstore_unregister);
static void decompress_record(struct pstore_record *record)
{
int ret;
int unzipped_len;
char *unzipped, *workspace;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
/* Only PSTORE_TYPE_DMESG support compression. */
if (record->type != PSTORE_TYPE_DMESG) {
pr_warn("ignored compressed record type %d\n", record->type);
return;
}
/* Missing compression buffer means compression was not initialized. */
if (!big_oops_buf) {
pr_warn("no decompression method initialized!\n");
return;
}
/* Allocate enough space to hold max decompression and ECC. */
unzipped_len = big_oops_buf_sz;
workspace = kmalloc(unzipped_len + record->ecc_notice_size,
GFP_KERNEL);
if (!workspace)
return;
/* After decompression "unzipped_len" is almost certainly smaller. */
ret = crypto_comp_decompress(tfm, record->buf, record->size,
workspace, &unzipped_len);
if (ret) {
pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
kfree(workspace);
return;
}
/* Append ECC notice to decompressed buffer. */
memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
/* Copy decompressed contents into an minimum-sized allocation. */
unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
GFP_KERNEL);
kfree(workspace);
if (!unzipped)
return;
/* Swap out compressed contents with decompressed contents. */
kfree(record->buf);
record->buf = unzipped;
record->size = unzipped_len;
record->compressed = false;
}
/*
* Read all the records from one persistent store backend. Create
* files in our filesystem. Don't warn about -EEXIST errors
* when we are re-scanning the backing store looking to add new
* error records.
*/
void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet)
{
int failed = 0;
unsigned int stop_loop = 65536;
if (!psi || !root)
return;
pstore: pass allocated memory region back to caller The buf_lock cannot be held while populating the inodes, so make the backend pass forward an allocated and filled buffer instead. This solves the following backtrace. The effect is that "buf" is only ever used to notify the backends that something was written to it, and shouldn't be used in the read path. To replace the buf_lock during the read path, isolate the open/read/close loop with a separate mutex to maintain serialized access to the backend. Note that is is up to the pstore backend to cope if the (*write)() path is called in the middle of the read path. [ 59.691019] BUG: sleeping function called from invalid context at .../mm/slub.c:847 [ 59.691019] in_atomic(): 0, irqs_disabled(): 1, pid: 1819, name: mount [ 59.691019] Pid: 1819, comm: mount Not tainted 3.0.8 #1 [ 59.691019] Call Trace: [ 59.691019] [<810252d5>] __might_sleep+0xc3/0xca [ 59.691019] [<810a26e6>] kmem_cache_alloc+0x32/0xf3 [ 59.691019] [<810b53ac>] ? __d_lookup_rcu+0x6f/0xf4 [ 59.691019] [<810b68b1>] alloc_inode+0x2a/0x64 [ 59.691019] [<810b6903>] new_inode+0x18/0x43 [ 59.691019] [<81142447>] pstore_get_inode.isra.1+0x11/0x98 [ 59.691019] [<81142623>] pstore_mkfile+0xae/0x26f [ 59.691019] [<810a2a66>] ? kmem_cache_free+0x19/0xb1 [ 59.691019] [<8116c821>] ? ida_get_new_above+0x140/0x158 [ 59.691019] [<811708ea>] ? __init_rwsem+0x1e/0x2c [ 59.691019] [<810b67e8>] ? inode_init_always+0x111/0x1b0 [ 59.691019] [<8102127e>] ? should_resched+0xd/0x27 [ 59.691019] [<8137977f>] ? _cond_resched+0xd/0x21 [ 59.691019] [<81142abf>] pstore_get_records+0x52/0xa7 [ 59.691019] [<8114254b>] pstore_fill_super+0x7d/0x91 [ 59.691019] [<810a7ff5>] mount_single+0x46/0x82 [ 59.691019] [<8114231a>] pstore_mount+0x15/0x17 [ 59.691019] [<811424ce>] ? pstore_get_inode.isra.1+0x98/0x98 [ 59.691019] [<810a8199>] mount_fs+0x5a/0x12d [ 59.691019] [<810b9174>] ? alloc_vfsmnt+0xa4/0x14a [ 59.691019] [<810b9474>] vfs_kern_mount+0x4f/0x7d [ 59.691019] [<810b9d7e>] do_kern_mount+0x34/0xb2 [ 59.691019] [<810bb15f>] do_mount+0x5fc/0x64a [ 59.691019] [<810912fb>] ? strndup_user+0x2e/0x3f [ 59.691019] [<810bb3cb>] sys_mount+0x66/0x99 [ 59.691019] [<8137b537>] sysenter_do_call+0x12/0x26 Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Tony Luck <tony.luck@intel.com>
2011-11-18 04:58:07 +08:00
mutex_lock(&psi->read_mutex);
if (psi->open && psi->open(psi))
goto out;
/*
* Backend callback read() allocates record.buf. decompress_record()
* may reallocate record.buf. On success, pstore_mkfile() will keep
* the record.buf, so free it only on failure.
*/
for (; stop_loop; stop_loop--) {
struct pstore_record *record;
int rc;
record = kzalloc(sizeof(*record), GFP_KERNEL);
if (!record) {
pr_err("out of memory creating record\n");
break;
}
pstore_record_init(record, psi);
record->size = psi->read(record);
/* No more records left in backend? */
if (record->size <= 0) {
kfree(record);
break;
}
decompress_record(record);
rc = pstore_mkfile(root, record);
if (rc) {
/* pstore_mkfile() did not take record, so free it. */
kfree(record->buf);
kfree(record);
if (rc != -EEXIST || !quiet)
failed++;
}
}
if (psi->close)
psi->close(psi);
out:
pstore: pass allocated memory region back to caller The buf_lock cannot be held while populating the inodes, so make the backend pass forward an allocated and filled buffer instead. This solves the following backtrace. The effect is that "buf" is only ever used to notify the backends that something was written to it, and shouldn't be used in the read path. To replace the buf_lock during the read path, isolate the open/read/close loop with a separate mutex to maintain serialized access to the backend. Note that is is up to the pstore backend to cope if the (*write)() path is called in the middle of the read path. [ 59.691019] BUG: sleeping function called from invalid context at .../mm/slub.c:847 [ 59.691019] in_atomic(): 0, irqs_disabled(): 1, pid: 1819, name: mount [ 59.691019] Pid: 1819, comm: mount Not tainted 3.0.8 #1 [ 59.691019] Call Trace: [ 59.691019] [<810252d5>] __might_sleep+0xc3/0xca [ 59.691019] [<810a26e6>] kmem_cache_alloc+0x32/0xf3 [ 59.691019] [<810b53ac>] ? __d_lookup_rcu+0x6f/0xf4 [ 59.691019] [<810b68b1>] alloc_inode+0x2a/0x64 [ 59.691019] [<810b6903>] new_inode+0x18/0x43 [ 59.691019] [<81142447>] pstore_get_inode.isra.1+0x11/0x98 [ 59.691019] [<81142623>] pstore_mkfile+0xae/0x26f [ 59.691019] [<810a2a66>] ? kmem_cache_free+0x19/0xb1 [ 59.691019] [<8116c821>] ? ida_get_new_above+0x140/0x158 [ 59.691019] [<811708ea>] ? __init_rwsem+0x1e/0x2c [ 59.691019] [<810b67e8>] ? inode_init_always+0x111/0x1b0 [ 59.691019] [<8102127e>] ? should_resched+0xd/0x27 [ 59.691019] [<8137977f>] ? _cond_resched+0xd/0x21 [ 59.691019] [<81142abf>] pstore_get_records+0x52/0xa7 [ 59.691019] [<8114254b>] pstore_fill_super+0x7d/0x91 [ 59.691019] [<810a7ff5>] mount_single+0x46/0x82 [ 59.691019] [<8114231a>] pstore_mount+0x15/0x17 [ 59.691019] [<811424ce>] ? pstore_get_inode.isra.1+0x98/0x98 [ 59.691019] [<810a8199>] mount_fs+0x5a/0x12d [ 59.691019] [<810b9174>] ? alloc_vfsmnt+0xa4/0x14a [ 59.691019] [<810b9474>] vfs_kern_mount+0x4f/0x7d [ 59.691019] [<810b9d7e>] do_kern_mount+0x34/0xb2 [ 59.691019] [<810bb15f>] do_mount+0x5fc/0x64a [ 59.691019] [<810912fb>] ? strndup_user+0x2e/0x3f [ 59.691019] [<810bb3cb>] sys_mount+0x66/0x99 [ 59.691019] [<8137b537>] sysenter_do_call+0x12/0x26 Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Tony Luck <tony.luck@intel.com>
2011-11-18 04:58:07 +08:00
mutex_unlock(&psi->read_mutex);
if (failed)
pr_warn("failed to create %d record(s) from '%s'\n",
failed, psi->name);
if (!stop_loop)
pr_err("looping? Too many records seen from '%s'\n",
psi->name);
}
static void pstore_dowork(struct work_struct *work)
{
pstore_get_records(1);
}
static void pstore_timefunc(struct timer_list *unused)
{
if (pstore_new_entry) {
pstore_new_entry = 0;
schedule_work(&pstore_work);
}
pstore_timer_kick();
}
static void __init pstore_choose_compression(void)
{
const struct pstore_zbackend *step;
if (!compress)
return;
for (step = zbackends; step->name; step++) {
if (!strcmp(compress, step->name)) {
zbackend = step;
return;
}
}
}
static int __init pstore_init(void)
{
int ret;
pstore_choose_compression();
/*
* Check if any pstore backends registered earlier but did not
* initialize compression because crypto was not ready. If so,
* initialize compression now.
*/
allocate_buf_for_compression();
ret = pstore_init_fs();
if (ret)
free_buf_for_compression();
return ret;
}
late_initcall(pstore_init);
static void __exit pstore_exit(void)
{
pstore_exit_fs();
}
module_exit(pstore_exit)
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
MODULE_LICENSE("GPL");