linux/drivers/base/devcoredump.c
Mukesh Ojha 01daccf748 devcoredump : Serialize devcd_del work
In following scenario(diagram), when one thread X running dev_coredumpm()
adds devcd device to the framework which sends uevent notification to
userspace and another thread Y reads this uevent and call to
devcd_data_write() which eventually try to delete the queued timer that
is not initialized/queued yet.

So, debug object reports some warning and in the meantime, timer is
initialized and queued from X path. and from Y path, it gets reinitialized
again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.

To fix this, introduce mutex and a boolean flag to serialize the behaviour.

 	cpu0(X)			                cpu1(Y)

    dev_coredump() uevent sent to user space
    device_add()  ======================> user space process Y reads the
                                          uevents writes to devcd fd
                                          which results into writes to

                                         devcd_data_write()
                                           mod_delayed_work()
                                             try_to_grab_pending()
                                               del_timer()
                                                 debug_assert_init()
   INIT_DELAYED_WORK()
   schedule_delayed_work()
                                                   debug_object_fixup()
                                                     timer_fixup_assert_init()
                                                       timer_setup()
                                                         do_init_timer()
                                                       /*
                                                        Above call reinitializes
                                                        the timer to
                                                        timer->entry.pprev=NULL
                                                        and this will be checked
                                                        later in timer_pending() call.
                                                       */
                                                 timer_pending()
                                                  !hlist_unhashed_lockless(&timer->entry)
                                                    !h->pprev
                                                /*
                                                  del_timer() checks h->pprev and finds
                                                  it to be NULL due to which
                                                  try_to_grab_pending() stucks.
                                                */

Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
Link: https://lore.kernel.org/r/1663073424-13663-1-git-send-email-quic_mojha@quicinc.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2022-09-24 14:01:40 +02:00

427 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright(c) 2014 Intel Mobile Communications GmbH
* Copyright(c) 2015 Intel Deutschland GmbH
*
* Author: Johannes Berg <johannes@sipsolutions.net>
*/
#include <linux/module.h>
#include <linux/device.h>
#include <linux/devcoredump.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/workqueue.h>
static struct class devcd_class;
/* global disable flag, for security purposes */
static bool devcd_disabled;
/* if data isn't read by userspace after 5 minutes then delete it */
#define DEVCD_TIMEOUT (HZ * 60 * 5)
struct devcd_entry {
struct device devcd_dev;
void *data;
size_t datalen;
/*
* Here, mutex is required to serialize the calls to del_wk work between
* user/kernel space which happens when devcd is added with device_add()
* and that sends uevent to user space. User space reads the uevents,
* and calls to devcd_data_write() which try to modify the work which is
* not even initialized/queued from devcoredump.
*
*
*
* cpu0(X) cpu1(Y)
*
* dev_coredump() uevent sent to user space
* device_add() ======================> user space process Y reads the
* uevents writes to devcd fd
* which results into writes to
*
* devcd_data_write()
* mod_delayed_work()
* try_to_grab_pending()
* del_timer()
* debug_assert_init()
* INIT_DELAYED_WORK()
* schedule_delayed_work()
*
*
* Also, mutex alone would not be enough to avoid scheduling of
* del_wk work after it get flush from a call to devcd_free()
* mentioned as below.
*
* disabled_store()
* devcd_free()
* mutex_lock() devcd_data_write()
* flush_delayed_work()
* mutex_unlock()
* mutex_lock()
* mod_delayed_work()
* mutex_unlock()
* So, delete_work flag is required.
*/
struct mutex mutex;
bool delete_work;
struct module *owner;
ssize_t (*read)(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen);
void (*free)(void *data);
struct delayed_work del_wk;
struct device *failing_dev;
};
static struct devcd_entry *dev_to_devcd(struct device *dev)
{
return container_of(dev, struct devcd_entry, devcd_dev);
}
static void devcd_dev_release(struct device *dev)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
devcd->free(devcd->data);
module_put(devcd->owner);
/*
* this seems racy, but I don't see a notifier or such on
* a struct device to know when it goes away?
*/
if (devcd->failing_dev->kobj.sd)
sysfs_delete_link(&devcd->failing_dev->kobj, &dev->kobj,
"devcoredump");
put_device(devcd->failing_dev);
kfree(devcd);
}
static void devcd_del(struct work_struct *wk)
{
struct devcd_entry *devcd;
devcd = container_of(wk, struct devcd_entry, del_wk.work);
device_del(&devcd->devcd_dev);
put_device(&devcd->devcd_dev);
}
static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buffer, loff_t offset, size_t count)
{
struct device *dev = kobj_to_dev(kobj);
struct devcd_entry *devcd = dev_to_devcd(dev);
return devcd->read(buffer, offset, count, devcd->data, devcd->datalen);
}
static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buffer, loff_t offset, size_t count)
{
struct device *dev = kobj_to_dev(kobj);
struct devcd_entry *devcd = dev_to_devcd(dev);
mutex_lock(&devcd->mutex);
if (!devcd->delete_work) {
devcd->delete_work = true;
mod_delayed_work(system_wq, &devcd->del_wk, 0);
}
mutex_unlock(&devcd->mutex);
return count;
}
static struct bin_attribute devcd_attr_data = {
.attr = { .name = "data", .mode = S_IRUSR | S_IWUSR, },
.size = 0,
.read = devcd_data_read,
.write = devcd_data_write,
};
static struct bin_attribute *devcd_dev_bin_attrs[] = {
&devcd_attr_data, NULL,
};
static const struct attribute_group devcd_dev_group = {
.bin_attrs = devcd_dev_bin_attrs,
};
static const struct attribute_group *devcd_dev_groups[] = {
&devcd_dev_group, NULL,
};
static int devcd_free(struct device *dev, void *data)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
mutex_lock(&devcd->mutex);
if (!devcd->delete_work)
devcd->delete_work = true;
flush_delayed_work(&devcd->del_wk);
mutex_unlock(&devcd->mutex);
return 0;
}
static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
char *buf)
{
return sysfs_emit(buf, "%d\n", devcd_disabled);
}
/*
*
* disabled_store() worker()
* class_for_each_device(&devcd_class,
* NULL, NULL, devcd_free)
* ...
* ...
* while ((dev = class_dev_iter_next(&iter))
* devcd_del()
* device_del()
* put_device() <- last reference
* error = fn(dev, data) devcd_dev_release()
* devcd_free(dev, data) kfree(devcd)
* mutex_lock(&devcd->mutex);
*
*
* In the above diagram, It looks like disabled_store() would be racing with parallely
* running devcd_del() and result in memory abort while acquiring devcd->mutex which
* is called after kfree of devcd memory after dropping its last reference with
* put_device(). However, this will not happens as fn(dev, data) runs
* with its own reference to device via klist_node so it is not its last reference.
* so, above situation would not occur.
*/
static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
const char *buf, size_t count)
{
long tmp = simple_strtol(buf, NULL, 10);
/*
* This essentially makes the attribute write-once, since you can't
* go back to not having it disabled. This is intentional, it serves
* as a system lockdown feature.
*/
if (tmp != 1)
return -EINVAL;
devcd_disabled = true;
class_for_each_device(&devcd_class, NULL, NULL, devcd_free);
return count;
}
static CLASS_ATTR_RW(disabled);
static struct attribute *devcd_class_attrs[] = {
&class_attr_disabled.attr,
NULL,
};
ATTRIBUTE_GROUPS(devcd_class);
static struct class devcd_class = {
.name = "devcoredump",
.owner = THIS_MODULE,
.dev_release = devcd_dev_release,
.dev_groups = devcd_dev_groups,
.class_groups = devcd_class_groups,
};
static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen)
{
return memory_read_from_buffer(buffer, count, &offset, data, datalen);
}
static void devcd_freev(void *data)
{
vfree(data);
}
/**
* dev_coredumpv - create device coredump with vmalloc data
* @dev: the struct device for the crashed device
* @data: vmalloc data containing the device coredump
* @datalen: length of the data
* @gfp: allocation flags
*
* This function takes ownership of the vmalloc'ed data and will free
* it when it is no longer used. See dev_coredumpm() for more information.
*/
void dev_coredumpv(struct device *dev, void *data, size_t datalen,
gfp_t gfp)
{
dev_coredumpm(dev, NULL, data, datalen, gfp, devcd_readv, devcd_freev);
}
EXPORT_SYMBOL_GPL(dev_coredumpv);
static int devcd_match_failing(struct device *dev, const void *failing)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
return devcd->failing_dev == failing;
}
/**
* devcd_free_sgtable - free all the memory of the given scatterlist table
* (i.e. both pages and scatterlist instances)
* NOTE: if two tables allocated with devcd_alloc_sgtable and then chained
* using the sg_chain function then that function should be called only once
* on the chained table
* @data: pointer to sg_table to free
*/
static void devcd_free_sgtable(void *data)
{
_devcd_free_sgtable(data);
}
/**
* devcd_read_from_sgtable - copy data from sg_table to a given buffer
* and return the number of bytes read
* @buffer: the buffer to copy the data to it
* @buf_len: the length of the buffer
* @data: the scatterlist table to copy from
* @offset: start copy from @offset@ bytes from the head of the data
* in the given scatterlist
* @data_len: the length of the data in the sg_table
*/
static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
size_t buf_len, void *data,
size_t data_len)
{
struct scatterlist *table = data;
if (offset > data_len)
return -EINVAL;
if (offset + buf_len > data_len)
buf_len = data_len - offset;
return sg_pcopy_to_buffer(table, sg_nents(table), buffer, buf_len,
offset);
}
/**
* dev_coredumpm - create device coredump with read/free methods
* @dev: the struct device for the crashed device
* @owner: the module that contains the read/free functions, use %THIS_MODULE
* @data: data cookie for the @read/@free functions
* @datalen: length of the data
* @gfp: allocation flags
* @read: function to read from the given buffer
* @free: function to free the given buffer
*
* Creates a new device coredump for the given device. If a previous one hasn't
* been read yet, the new coredump is discarded. The data lifetime is determined
* by the device coredump framework and when it is no longer needed the @free
* function will be called to free the data.
*/
void dev_coredumpm(struct device *dev, struct module *owner,
void *data, size_t datalen, gfp_t gfp,
ssize_t (*read)(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen),
void (*free)(void *data))
{
static atomic_t devcd_count = ATOMIC_INIT(0);
struct devcd_entry *devcd;
struct device *existing;
if (devcd_disabled)
goto free;
existing = class_find_device(&devcd_class, NULL, dev,
devcd_match_failing);
if (existing) {
put_device(existing);
goto free;
}
if (!try_module_get(owner))
goto free;
devcd = kzalloc(sizeof(*devcd), gfp);
if (!devcd)
goto put_module;
devcd->owner = owner;
devcd->data = data;
devcd->datalen = datalen;
devcd->read = read;
devcd->free = free;
devcd->failing_dev = get_device(dev);
devcd->delete_work = false;
mutex_init(&devcd->mutex);
device_initialize(&devcd->devcd_dev);
dev_set_name(&devcd->devcd_dev, "devcd%d",
atomic_inc_return(&devcd_count));
devcd->devcd_dev.class = &devcd_class;
mutex_lock(&devcd->mutex);
if (device_add(&devcd->devcd_dev))
goto put_device;
/*
* These should normally not fail, but there is no problem
* continuing without the links, so just warn instead of
* failing.
*/
if (sysfs_create_link(&devcd->devcd_dev.kobj, &dev->kobj,
"failing_device") ||
sysfs_create_link(&dev->kobj, &devcd->devcd_dev.kobj,
"devcoredump"))
dev_warn(dev, "devcoredump create_link failed\n");
INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
mutex_unlock(&devcd->mutex);
return;
put_device:
put_device(&devcd->devcd_dev);
mutex_unlock(&devcd->mutex);
put_module:
module_put(owner);
free:
free(data);
}
EXPORT_SYMBOL_GPL(dev_coredumpm);
/**
* dev_coredumpsg - create device coredump that uses scatterlist as data
* parameter
* @dev: the struct device for the crashed device
* @table: the dump data
* @datalen: length of the data
* @gfp: allocation flags
*
* Creates a new device coredump for the given device. If a previous one hasn't
* been read yet, the new coredump is discarded. The data lifetime is determined
* by the device coredump framework and when it is no longer needed
* it will free the data.
*/
void dev_coredumpsg(struct device *dev, struct scatterlist *table,
size_t datalen, gfp_t gfp)
{
dev_coredumpm(dev, NULL, table, datalen, gfp, devcd_read_from_sgtable,
devcd_free_sgtable);
}
EXPORT_SYMBOL_GPL(dev_coredumpsg);
static int __init devcoredump_init(void)
{
return class_register(&devcd_class);
}
__initcall(devcoredump_init);
static void __exit devcoredump_exit(void)
{
class_for_each_device(&devcd_class, NULL, NULL, devcd_free);
class_unregister(&devcd_class);
}
__exitcall(devcoredump_exit);