2019-06-04 16:11:33 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2015-01-21 00:36:55 +08:00
|
|
|
/*
|
|
|
|
* inode.c - part of tracefs, a pseudo file system for activating tracing
|
|
|
|
*
|
|
|
|
* Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
|
|
|
|
*
|
|
|
|
* Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
|
|
|
|
*
|
|
|
|
* tracefs is the file system that is used by the tracing infrastructure.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/mount.h>
|
2015-01-22 00:28:23 +08:00
|
|
|
#include <linux/kobject.h>
|
2015-01-21 00:36:55 +08:00
|
|
|
#include <linux/namei.h>
|
|
|
|
#include <linux/tracefs.h>
|
|
|
|
#include <linux/fsnotify.h>
|
2019-10-12 08:41:41 +08:00
|
|
|
#include <linux/security.h>
|
2015-01-21 00:36:55 +08:00
|
|
|
#include <linux/seq_file.h>
|
|
|
|
#include <linux/parser.h>
|
|
|
|
#include <linux/magic.h>
|
|
|
|
#include <linux/slab.h>
|
2023-07-29 02:20:44 +08:00
|
|
|
#include "internal.h"
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
#define TRACEFS_DEFAULT_MODE 0700
|
2023-07-29 02:20:44 +08:00
|
|
|
static struct kmem_cache *tracefs_inode_cachep __ro_after_init;
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
static struct vfsmount *tracefs_mount;
|
|
|
|
static int tracefs_mount_count;
|
|
|
|
static bool tracefs_registered;
|
|
|
|
|
2023-07-29 02:20:44 +08:00
|
|
|
static struct inode *tracefs_alloc_inode(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct tracefs_inode *ti;
|
|
|
|
|
|
|
|
ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL);
|
|
|
|
if (!ti)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
ti->flags = 0;
|
|
|
|
|
|
|
|
return &ti->vfs_inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void tracefs_free_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode));
|
|
|
|
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
/*
 * Fallback ->read handler: transfers nothing and always reports 0 bytes,
 * i.e. end of file.  None of the arguments are looked at.
 */
static ssize_t default_read_file(struct file *file, char __user *buf,
				 size_t count, loff_t *ppos)
{
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Fallback ->write handler: discards the data but reports that all
 * @count bytes were consumed, so writers never see a short write.
 */
static ssize_t default_write_file(struct file *file, const char __user *buf,
				  size_t count, loff_t *ppos)
{
	return count;
}
|
|
|
|
|
|
|
|
static const struct file_operations tracefs_file_operations = {
|
|
|
|
.read = default_read_file,
|
|
|
|
.write = default_write_file,
|
|
|
|
.open = simple_open,
|
|
|
|
.llseek = noop_llseek,
|
|
|
|
};
|
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
/*
 * Callbacks used to service mkdir(2)/rmdir(2) on a tracefs directory.
 * Each one is handed the bare name of the directory being created or
 * removed and returns 0 or a negative errno, which is passed straight
 * back to the VFS by tracefs_syscall_mkdir()/tracefs_syscall_rmdir().
 */
static struct tracefs_dir_ops {
	int (*mkdir)(const char *name);
	int (*rmdir)(const char *name);
} tracefs_ops __ro_after_init;
|
2015-01-21 23:01:39 +08:00
|
|
|
|
|
|
|
static char *get_dname(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
const char *dname;
|
|
|
|
char *name;
|
|
|
|
int len = dentry->d_name.len;
|
|
|
|
|
|
|
|
dname = dentry->d_name.name;
|
|
|
|
name = kmalloc(len + 1, GFP_KERNEL);
|
|
|
|
if (!name)
|
|
|
|
return NULL;
|
|
|
|
memcpy(name, dname, len);
|
|
|
|
name[len] = 0;
|
|
|
|
return name;
|
|
|
|
}
|
|
|
|
|
2023-01-13 19:49:15 +08:00
|
|
|
static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
|
2021-01-21 21:19:43 +08:00
|
|
|
struct inode *inode, struct dentry *dentry,
|
|
|
|
umode_t mode)
|
2015-01-21 23:01:39 +08:00
|
|
|
{
|
|
|
|
char *name;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
name = get_dname(dentry);
|
|
|
|
if (!name)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The mkdir call can call the generic functions that create
|
|
|
|
* the files within the tracefs system. It is up to the individual
|
|
|
|
* mkdir routine to handle races.
|
|
|
|
*/
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2015-01-21 23:01:39 +08:00
|
|
|
ret = tracefs_ops.mkdir(name);
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_lock(inode);
|
2015-01-21 23:01:39 +08:00
|
|
|
|
|
|
|
kfree(name);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
|
|
|
|
{
|
|
|
|
char *name;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
name = get_dname(dentry);
|
|
|
|
if (!name)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The rmdir call can call the generic functions that create
|
|
|
|
* the files within the tracefs system. It is up to the individual
|
|
|
|
* rmdir routine to handle races.
|
|
|
|
* This time we need to unlock not only the parent (inode) but
|
|
|
|
* also the directory that is being deleted.
|
|
|
|
*/
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_unlock(d_inode(dentry));
|
2015-01-21 23:01:39 +08:00
|
|
|
|
|
|
|
ret = tracefs_ops.rmdir(name);
|
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_lock_nested(inode, I_MUTEX_PARENT);
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_lock(d_inode(dentry));
|
2015-01-21 23:01:39 +08:00
|
|
|
|
|
|
|
kfree(name);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct inode_operations tracefs_dir_inode_operations = {
|
|
|
|
.lookup = simple_lookup,
|
|
|
|
.mkdir = tracefs_syscall_mkdir,
|
|
|
|
.rmdir = tracefs_syscall_rmdir,
|
|
|
|
};
|
|
|
|
|
2023-07-29 02:20:45 +08:00
|
|
|
struct inode *tracefs_get_inode(struct super_block *sb)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = new_inode(sb);
|
|
|
|
if (inode) {
|
|
|
|
inode->i_ino = get_next_ino();
|
2023-10-05 02:52:57 +08:00
|
|
|
simple_inode_init_ts(inode);
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct tracefs_mount_opts {
|
|
|
|
kuid_t uid;
|
|
|
|
kgid_t gid;
|
|
|
|
umode_t mode;
|
2022-08-27 08:44:17 +08:00
|
|
|
/* Opt_* bitfield. */
|
|
|
|
unsigned int opts;
|
2015-01-21 00:36:55 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Mount option tokens.  The values double as bit numbers in
 * tracefs_mount_opts.opts, so they are spelled out explicitly.
 */
enum {
	Opt_uid		= 0,
	Opt_gid		= 1,
	Opt_mode	= 2,
	Opt_err		= 3
};
|
|
|
|
|
|
|
|
static const match_table_t tokens = {
|
|
|
|
{Opt_uid, "uid=%u"},
|
|
|
|
{Opt_gid, "gid=%u"},
|
|
|
|
{Opt_mode, "mode=%o"},
|
|
|
|
{Opt_err, NULL}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct tracefs_fs_info {
|
|
|
|
struct tracefs_mount_opts mount_opts;
|
|
|
|
};
|
|
|
|
|
2021-12-08 06:17:29 +08:00
|
|
|
/* Set the group of @dentry's inode to @gid; dentries without an inode are skipped. */
static void change_gid(struct dentry *dentry, kgid_t gid)
{
	struct inode *inode = dentry->d_inode;

	if (inode)
		inode->i_gid = gid;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Taken from d_walk, but without the need for handling renames.
|
|
|
|
* Nothing can be renamed while walking the list, as tracefs
|
|
|
|
* does not support renames. This is only called when mounting
|
|
|
|
* or remounting the file system, to set all the files to
|
|
|
|
* the given gid.
|
|
|
|
*/
|
|
|
|
/*
 * Walk every dentry below (and including) @parent and set the group of
 * its inode to @gid.
 *
 * The walk is iterative: it descends into a child as soon as that child
 * has entries of its own and climbs back up once a level is exhausted.
 * The d_lock of the directory currently being scanned is held throughout.
 */
static void set_gid(struct dentry *parent, kgid_t gid)
{
	struct dentry *this_parent = parent;
	struct list_head *next;

	spin_lock(&this_parent->d_lock);

	change_gid(this_parent, gid);
repeat:
	next = this_parent->d_subdirs.next;
resume:
	while (next != &this_parent->d_subdirs) {
		struct dentry *dentry = list_entry(next, struct dentry, d_child);
		struct tracefs_inode *ti;

		/* Advance first so that "continue" moves on to the next sibling. */
		next = next->next;

		/* Note, getdents() can add a cursor dentry with no inode */
		if (!dentry->d_inode)
			continue;

		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

		change_gid(dentry, gid);

		/*
		 * If this is the events directory, update that too: its
		 * files are created on demand and take their default
		 * ownership from the events directory.
		 */
		ti = get_tracefs(dentry->d_inode);
		if (ti && (ti->flags & TRACEFS_EVENT_INODE))
			eventfs_update_gid(dentry, gid);

		if (!list_empty(&dentry->d_subdirs)) {
			/*
			 * Descend.  The child's d_lock stays held; only its
			 * lockdep annotation is switched from "nested child"
			 * to that of the new parent.
			 */
			spin_unlock(&this_parent->d_lock);
			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
			this_parent = dentry;
			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
			goto repeat;
		}
		spin_unlock(&dentry->d_lock);
	}
	/*
	 * All done at this level ... ascend and resume the search.
	 */
	rcu_read_lock();
ascend:
	if (this_parent != parent) {
		struct dentry *child = this_parent;

		this_parent = child->d_parent;

		spin_unlock(&child->d_lock);
		spin_lock(&this_parent->d_lock);

		/* go into the first sibling still alive */
		do {
			next = child->d_child.next;
			if (next == &this_parent->d_subdirs)
				goto ascend;
			child = list_entry(next, struct dentry, d_child);
		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
		rcu_read_unlock();
		goto resume;
	}
	rcu_read_unlock();
	spin_unlock(&this_parent->d_lock);
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
|
|
|
|
{
|
|
|
|
substring_t args[MAX_OPT_ARGS];
|
|
|
|
int option;
|
|
|
|
int token;
|
|
|
|
kuid_t uid;
|
|
|
|
kgid_t gid;
|
|
|
|
char *p;
|
|
|
|
|
2022-08-27 08:44:17 +08:00
|
|
|
opts->opts = 0;
|
2015-01-21 00:36:55 +08:00
|
|
|
opts->mode = TRACEFS_DEFAULT_MODE;
|
|
|
|
|
|
|
|
while ((p = strsep(&data, ",")) != NULL) {
|
|
|
|
if (!*p)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
token = match_token(p, tokens, args);
|
|
|
|
switch (token) {
|
|
|
|
case Opt_uid:
|
|
|
|
if (match_int(&args[0], &option))
|
|
|
|
return -EINVAL;
|
|
|
|
uid = make_kuid(current_user_ns(), option);
|
|
|
|
if (!uid_valid(uid))
|
|
|
|
return -EINVAL;
|
|
|
|
opts->uid = uid;
|
|
|
|
break;
|
|
|
|
case Opt_gid:
|
|
|
|
if (match_int(&args[0], &option))
|
|
|
|
return -EINVAL;
|
|
|
|
gid = make_kgid(current_user_ns(), option);
|
|
|
|
if (!gid_valid(gid))
|
|
|
|
return -EINVAL;
|
|
|
|
opts->gid = gid;
|
|
|
|
break;
|
|
|
|
case Opt_mode:
|
|
|
|
if (match_octal(&args[0], &option))
|
|
|
|
return -EINVAL;
|
|
|
|
opts->mode = option & S_IALLUGO;
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* We might like to report bad mount options here;
|
|
|
|
* but traditionally tracefs has ignored all mount options
|
|
|
|
*/
|
|
|
|
}
|
2022-08-27 08:44:17 +08:00
|
|
|
|
|
|
|
opts->opts |= BIT(token);
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-08-27 08:44:17 +08:00
|
|
|
/*
 * Apply the parsed mount options stored in @sb's tracefs_fs_info.
 *
 * On a first mount (@remount false) mode, uid and gid are always applied.
 * On remount, only reset mode/uid/gid if they were provided as mount
 * options.  Mode and uid affect the root inode only; the gid is pushed
 * down to every dentry via set_gid().
 *
 * Always returns 0.
 */
static int tracefs_apply_options(struct super_block *sb, bool remount)
{
	struct tracefs_fs_info *fsi = sb->s_fs_info;
	struct tracefs_mount_opts *opts = &fsi->mount_opts;
	struct inode *root = d_inode(sb->s_root);
	bool first_mount = !remount;

	if (first_mount || (opts->opts & BIT(Opt_mode))) {
		/* Replace only the permission bits, keep the file type. */
		umode_t new_mode = READ_ONCE(root->i_mode) & ~S_IALLUGO;

		new_mode |= opts->mode;
		WRITE_ONCE(root->i_mode, new_mode);
	}

	if (first_mount || (opts->opts & BIT(Opt_uid)))
		root->i_uid = opts->uid;

	/* Set all the group ids to the mount option */
	if (first_mount || (opts->opts & BIT(Opt_gid)))
		set_gid(sb->s_root, opts->gid);

	return 0;
}
|
|
|
|
|
|
|
|
static int tracefs_remount(struct super_block *sb, int *flags, char *data)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
struct tracefs_fs_info *fsi = sb->s_fs_info;
|
|
|
|
|
|
|
|
sync_filesystem(sb);
|
|
|
|
err = tracefs_parse_options(data, &fsi->mount_opts);
|
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
2022-08-27 08:44:17 +08:00
|
|
|
tracefs_apply_options(sb, true);
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
fail:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int tracefs_show_options(struct seq_file *m, struct dentry *root)
|
|
|
|
{
|
|
|
|
struct tracefs_fs_info *fsi = root->d_sb->s_fs_info;
|
|
|
|
struct tracefs_mount_opts *opts = &fsi->mount_opts;
|
|
|
|
|
|
|
|
if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
|
|
|
|
seq_printf(m, ",uid=%u",
|
|
|
|
from_kuid_munged(&init_user_ns, opts->uid));
|
|
|
|
if (!gid_eq(opts->gid, GLOBAL_ROOT_GID))
|
|
|
|
seq_printf(m, ",gid=%u",
|
|
|
|
from_kgid_munged(&init_user_ns, opts->gid));
|
|
|
|
if (opts->mode != TRACEFS_DEFAULT_MODE)
|
|
|
|
seq_printf(m, ",mode=%o", opts->mode);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct super_operations tracefs_super_operations = {
|
2023-07-29 02:20:44 +08:00
|
|
|
.alloc_inode = tracefs_alloc_inode,
|
|
|
|
.free_inode = tracefs_free_inode,
|
|
|
|
.drop_inode = generic_delete_inode,
|
2015-01-21 00:36:55 +08:00
|
|
|
.statfs = simple_statfs,
|
|
|
|
.remount_fs = tracefs_remount,
|
|
|
|
.show_options = tracefs_show_options,
|
|
|
|
};
|
|
|
|
|
eventfs: Move tracing/events to eventfs
Up until now, /sys/kernel/tracing/events was no different than any other
part of tracefs. The files and directories within the events directory were
created when the tracefs was mounted, and also created for the instances in
/sys/kernel/tracing/instances/<instance>/events. Most of these files and
directories will never be referenced. Since there are thousands of these
files and directories they spend their time wasting precious memory
resources.
Move the "events" directory to the new eventfs. The eventfs will take the
meta data of the events that they represent and store that. When the files
in the events directory are referenced, the dentry and inodes to represent
them are then created. When the files are no longer referenced, they are
freed. This saves the precious memory resources that were wasted on these
seldom referenced dentries and inodes.
Running the following:
~# cat /proc/meminfo /proc/slabinfo > before.out
~# mkdir /sys/kernel/tracing/instances/foo
~# cat /proc/meminfo /proc/slabinfo > after.out
to test the changes produces the following deltas:
Before this change:
Before after deltas for meminfo:
MemFree: -32260
MemAvailable: -21496
KReclaimable: 21528
Slab: 22440
SReclaimable: 21528
SUnreclaim: 912
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 14472 [* 1184 = 17134848]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 28 [* 1480 = 41440]
dentry: 14450 [* 312 = 4508400]
lsm_inode_cache: 14453 [* 32 = 462496]
vma_lock: 11 [* 152 = 1672]
vm_area_struct: 2 [* 184 = 368]
trace_event_file: 1748 [* 88 = 153824]
kmalloc-256: 1072 [* 256 = 274432]
kmalloc-64: 2842 [* 64 = 181888]
Total slab additions in size: 22,763,400 bytes
With this change:
Before after deltas for meminfo:
MemFree: -12600
MemAvailable: -12580
Cached: 24
Active: 12
Inactive: 68
Inactive(anon): 48
Active(file): 12
Inactive(file): 20
Dirty: -4
AnonPages: 68
KReclaimable: 12
Slab: 1856
SReclaimable: 12
SUnreclaim: 1844
KernelStack: 16
PageTables: 36
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 108 [* 1184 = 127872]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 18 [* 1480 = 26640]
dentry: 127 [* 312 = 39624]
lsm_inode_cache: 152 [* 32 = 4864]
vma_lock: 67 [* 152 = 10184]
vm_area_struct: -12 [* 184 = -2208]
trace_event_file: 1764 [* 96 = 169344]
kmalloc-96: 14322 [* 96 = 1374912]
kmalloc-64: 2814 [* 64 = 180096]
kmalloc-32: 1103 [* 32 = 35296]
kmalloc-16: 2308 [* 16 = 36928]
kmalloc-8: 12800 [* 8 = 102400]
Total slab additions in size: 2,109,984 bytes
Which is a savings of 20,653,416 bytes (20 MB) per tracing instance.
Link: https://lkml.kernel.org/r/1690568452-46553-10-git-send-email-akaher@vmware.com
Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-29 02:20:51 +08:00
|
|
|
static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
|
|
|
|
{
|
|
|
|
struct tracefs_inode *ti;
|
|
|
|
|
|
|
|
if (!dentry || !inode)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ti = get_tracefs(inode);
|
|
|
|
if (ti && ti->flags & TRACEFS_EVENT_INODE)
|
eventfs: Remove eventfs_file and just use eventfs_inode
Instead of having a descriptor for every file represented in the eventfs
directory, only have the directory itself represented. Change the API to
send in a list of entries that represent all the files in the directory
(but not other directories). The entry list contains a name and a callback
function that will be used to create the files when they are accessed.
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent,
const struct eventfs_entry *entries,
int size, void *data);
is used for the top level eventfs directory, and returns an eventfs_inode
that will be used by:
struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent,
const struct eventfs_entry *entries,
int size, void *data);
where both of the above take an array of struct eventfs_entry entries for
every file that is in the directory.
The entries are defined by:
typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
const struct file_operations **fops);
struct eventfs_entry {
const char *name;
eventfs_callback callback;
};
Where the name is the name of the file and the callback gets called when
the file is being created. The callback passes in the name (in case the
same callback is used for multiple files), a pointer to the mode, data and
fops. The data will be pointing to the data that was passed in
eventfs_create_dir() or eventfs_create_events_dir() but may be overridden
to point to something else, as it will be used to point to the
inode->i_private that is created. The information passed back from the
callback is used to create the dentry/inode.
If the callback fills the data and the file should be created, it must
return a positive number. On zero or negative, the file is ignored.
This logic may also be used as a prototype to convert entire pseudo file
systems into just-in-time allocation.
The "show_events_dentry" file has been updated to show the directories,
and any files they have.
With just the eventfs_file allocations:
Before after deltas for meminfo (in kB):
MemFree: -14360
MemAvailable: -14260
Buffers: 40
Cached: 24
Active: 44
Inactive: 48
Inactive(anon): 28
Active(file): 44
Inactive(file): 20
Dirty: -4
AnonPages: 28
Mapped: 4
KReclaimable: 132
Slab: 1604
SReclaimable: 132
SUnreclaim: 1472
Committed_AS: 12
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
ext4_inode_cache 27 [* 1184 = 31968 ]
extent_status 102 [* 40 = 4080 ]
tracefs_inode_cache 144 [* 656 = 94464 ]
buffer_head 39 [* 104 = 4056 ]
shmem_inode_cache 49 [* 800 = 39200 ]
filp -53 [* 256 = -13568 ]
dentry 251 [* 192 = 48192 ]
lsm_file_cache 277 [* 32 = 8864 ]
vm_area_struct -14 [* 184 = -2576 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k 35 [* 1024 = 35840 ]
kmalloc-256 49 [* 256 = 12544 ]
kmalloc-192 -28 [* 192 = -5376 ]
kmalloc-128 -30 [* 128 = -3840 ]
kmalloc-96 10581 [* 96 = 1015776 ]
kmalloc-64 3056 [* 64 = 195584 ]
kmalloc-32 1291 [* 32 = 41312 ]
kmalloc-16 2310 [* 16 = 36960 ]
kmalloc-8 9216 [* 8 = 73728 ]
Free memory dropped by 14,360 kB
Available memory dropped by 14,260 kB
Total slab additions in size: 1,771,032 bytes
With this change:
Before after deltas for meminfo (in kB):
MemFree: -12084
MemAvailable: -11976
Buffers: 32
Cached: 32
Active: 72
Inactive: 168
Inactive(anon): 176
Active(file): 72
Inactive(file): -8
Dirty: 24
AnonPages: 196
Mapped: 8
KReclaimable: 148
Slab: 836
SReclaimable: 148
SUnreclaim: 688
Committed_AS: 324
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache 144 [* 656 = 94464 ]
shmem_inode_cache -23 [* 800 = -18400 ]
filp -92 [* 256 = -23552 ]
dentry 179 [* 192 = 34368 ]
lsm_file_cache -3 [* 32 = -96 ]
vm_area_struct -13 [* 184 = -2392 ]
trace_event_file 1748 [* 88 = 153824 ]
kmalloc-1k -49 [* 1024 = -50176 ]
kmalloc-256 -27 [* 256 = -6912 ]
kmalloc-128 1864 [* 128 = 238592 ]
kmalloc-64 4685 [* 64 = 299840 ]
kmalloc-32 -72 [* 32 = -2304 ]
kmalloc-16 256 [* 16 = 4096 ]
total = 721352
Free memory dropped by 12,084 kB
Available memory dropped by 11,976 kB
Total slab additions in size: 721,352 bytes
That's over 2 MB in savings per instance for free and available memory,
and over 1 MB in savings per instance of slab memory.
Link: https://lore.kernel.org/linux-trace-kernel/20231003184059.4924468e@gandalf.local.home
Link: https://lore.kernel.org/linux-trace-kernel/20231004165007.43d79161@gandalf.local.home
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ajay Kaher <akaher@vmware.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-10-05 04:50:07 +08:00
|
|
|
eventfs_set_ei_status_free(ti, dentry);
|
eventfs: Move tracing/events to eventfs
Up until now, /sys/kernel/tracing/events was no different than any other
part of tracefs. The files and directories within the events directory was
created when the tracefs was mounted, and also created for the instances in
/sys/kernel/tracing/instances/<instance>/events. Most of these files and
directories will never be referenced. Since there are thousands of these
files and directories they spend their time wasting precious memory
resources.
Move the "events" directory to the new eventfs. The eventfs will take the
meta data of the events that they represent and store that. When the files
in the events directory are referenced, the dentry and inodes to represent
them are then created. When the files are no longer referenced, they are
freed. This saves the precious memory resources that were wasted on these
seldom referenced dentries and inodes.
Running the following:
~# cat /proc/meminfo /proc/slabinfo > before.out
~# mkdir /sys/kernel/tracing/instances/foo
~# cat /proc/meminfo /proc/slabinfo > after.out
to test the changes produces the following deltas:
Before this change:
Before after deltas for meminfo:
MemFree: -32260
MemAvailable: -21496
KReclaimable: 21528
Slab: 22440
SReclaimable: 21528
SUnreclaim: 912
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 14472 [* 1184 = 17134848]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 28 [* 1480 = 41440]
dentry: 14450 [* 312 = 4508400]
lsm_inode_cache: 14453 [* 32 = 462496]
vma_lock: 11 [* 152 = 1672]
vm_area_struct: 2 [* 184 = 368]
trace_event_file: 1748 [* 88 = 153824]
kmalloc-256: 1072 [* 256 = 274432]
kmalloc-64: 2842 [* 64 = 181888]
Total slab additions in size: 22,763,400 bytes
With this change:
Before after deltas for meminfo:
MemFree: -12600
MemAvailable: -12580
Cached: 24
Active: 12
Inactive: 68
Inactive(anon): 48
Active(file): 12
Inactive(file): 20
Dirty: -4
AnonPages: 68
KReclaimable: 12
Slab: 1856
SReclaimable: 12
SUnreclaim: 1844
KernelStack: 16
PageTables: 36
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 108 [* 1184 = 127872]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 18 [* 1480 = 26640]
dentry: 127 [* 312 = 39624]
lsm_inode_cache: 152 [* 32 = 4864]
vma_lock: 67 [* 152 = 10184]
vm_area_struct: -12 [* 184 = -2208]
trace_event_file: 1764 [* 96 = 169344]
kmalloc-96: 14322 [* 96 = 1374912]
kmalloc-64: 2814 [* 64 = 180096]
kmalloc-32: 1103 [* 32 = 35296]
kmalloc-16: 2308 [* 16 = 36928]
kmalloc-8: 12800 [* 8 = 102400]
Total slab additions in size: 2,109,984 bytes
Which is a savings of 20,653,416 bytes (20 MB) per tracing instance.
Link: https://lkml.kernel.org/r/1690568452-46553-10-git-send-email-akaher@vmware.com
Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-29 02:20:51 +08:00
|
|
|
iput(inode);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct dentry_operations tracefs_dentry_operations = {
|
|
|
|
.d_iput = tracefs_dentry_iput,
|
|
|
|
};
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
static int trace_fill_super(struct super_block *sb, void *data, int silent)
|
|
|
|
{
|
2017-03-26 12:15:37 +08:00
|
|
|
static const struct tree_descr trace_files[] = {{""}};
|
2015-01-21 00:36:55 +08:00
|
|
|
struct tracefs_fs_info *fsi;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL);
|
|
|
|
sb->s_fs_info = fsi;
|
|
|
|
if (!fsi) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = tracefs_parse_options(data, &fsi->mount_opts);
|
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
|
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
sb->s_op = &tracefs_super_operations;
|
eventfs: Move tracing/events to eventfs
Up until now, /sys/kernel/tracing/events was no different than any other
part of tracefs. The files and directories within the events directory was
created when the tracefs was mounted, and also created for the instances in
/sys/kernel/tracing/instances/<instance>/events. Most of these files and
directories will never be referenced. Since there are thousands of these
files and directories they spend their time wasting precious memory
resources.
Move the "events" directory to the new eventfs. The eventfs will take the
meta data of the events that they represent and store that. When the files
in the events directory are referenced, the dentry and inodes to represent
them are then created. When the files are no longer referenced, they are
freed. This saves the precious memory resources that were wasted on these
seldom referenced dentries and inodes.
Running the following:
~# cat /proc/meminfo /proc/slabinfo > before.out
~# mkdir /sys/kernel/tracing/instances/foo
~# cat /proc/meminfo /proc/slabinfo > after.out
to test the changes produces the following deltas:
Before this change:
Before after deltas for meminfo:
MemFree: -32260
MemAvailable: -21496
KReclaimable: 21528
Slab: 22440
SReclaimable: 21528
SUnreclaim: 912
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 14472 [* 1184 = 17134848]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 28 [* 1480 = 41440]
dentry: 14450 [* 312 = 4508400]
lsm_inode_cache: 14453 [* 32 = 462496]
vma_lock: 11 [* 152 = 1672]
vm_area_struct: 2 [* 184 = 368]
trace_event_file: 1748 [* 88 = 153824]
kmalloc-256: 1072 [* 256 = 274432]
kmalloc-64: 2842 [* 64 = 181888]
Total slab additions in size: 22,763,400 bytes
With this change:
Before after deltas for meminfo:
MemFree: -12600
MemAvailable: -12580
Cached: 24
Active: 12
Inactive: 68
Inactive(anon): 48
Active(file): 12
Inactive(file): 20
Dirty: -4
AnonPages: 68
KReclaimable: 12
Slab: 1856
SReclaimable: 12
SUnreclaim: 1844
KernelStack: 16
PageTables: 36
VmallocUsed: 16
Before after deltas for slabinfo:
<slab>: <objects> [ * <size> = <total>]
tracefs_inode_cache: 108 [* 1184 = 127872]
buffer_head: 24 [* 168 = 4032]
hmem_inode_cache: 18 [* 1480 = 26640]
dentry: 127 [* 312 = 39624]
lsm_inode_cache: 152 [* 32 = 4864]
vma_lock: 67 [* 152 = 10184]
vm_area_struct: -12 [* 184 = -2208]
trace_event_file: 1764 [* 96 = 169344]
kmalloc-96: 14322 [* 96 = 1374912]
kmalloc-64: 2814 [* 64 = 180096]
kmalloc-32: 1103 [* 32 = 35296]
kmalloc-16: 2308 [* 16 = 36928]
kmalloc-8: 12800 [* 8 = 102400]
Total slab additions in size: 2,109,984 bytes
Which is a savings of 20,653,416 bytes (20 MB) per tracing instance.
Link: https://lkml.kernel.org/r/1690568452-46553-10-git-send-email-akaher@vmware.com
Signed-off-by: Ajay Kaher <akaher@vmware.com>
Co-developed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Tested-by: Ching-lin Yu <chinglinyu@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2023-07-29 02:20:51 +08:00
|
|
|
sb->s_d_op = &tracefs_dentry_operations;
|
2015-01-21 00:36:55 +08:00
|
|
|
|
2022-08-27 08:44:17 +08:00
|
|
|
tracefs_apply_options(sb, false);
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fail:
|
|
|
|
kfree(fsi);
|
|
|
|
sb->s_fs_info = NULL;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct dentry *trace_mount(struct file_system_type *fs_type,
|
|
|
|
int flags, const char *dev_name,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
return mount_single(fs_type, flags, data, trace_fill_super);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct file_system_type trace_fs_type = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.name = "tracefs",
|
|
|
|
.mount = trace_mount,
|
|
|
|
.kill_sb = kill_litter_super,
|
|
|
|
};
|
|
|
|
MODULE_ALIAS_FS("tracefs");
|
|
|
|
|
2023-07-29 02:20:45 +08:00
|
|
|
struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
pr_debug("tracefs: creating file '%s'\n",name);
|
|
|
|
|
|
|
|
error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
|
|
|
|
&tracefs_mount_count);
|
|
|
|
if (error)
|
|
|
|
return ERR_PTR(error);
|
|
|
|
|
|
|
|
/* If the parent is not specified, we create it in the root.
|
|
|
|
* We need the root dentry to do this, which is in the super
|
|
|
|
* block. A pointer to that is in the struct vfsmount that we
|
|
|
|
* have around.
|
|
|
|
*/
|
|
|
|
if (!parent)
|
|
|
|
parent = tracefs_mount->mnt_root;
|
|
|
|
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_lock(d_inode(parent));
|
|
|
|
if (unlikely(IS_DEADDIR(d_inode(parent))))
|
2019-11-18 22:43:10 +08:00
|
|
|
dentry = ERR_PTR(-ENOENT);
|
|
|
|
else
|
|
|
|
dentry = lookup_one_len(name, parent, strlen(name));
|
2021-12-08 23:27:31 +08:00
|
|
|
if (!IS_ERR(dentry) && d_inode(dentry)) {
|
2015-01-21 00:36:55 +08:00
|
|
|
dput(dentry);
|
|
|
|
dentry = ERR_PTR(-EEXIST);
|
|
|
|
}
|
2015-11-05 06:33:17 +08:00
|
|
|
|
|
|
|
if (IS_ERR(dentry)) {
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_unlock(d_inode(parent));
|
2015-11-05 06:33:17 +08:00
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
|
|
|
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
|
2023-07-29 02:20:45 +08:00
|
|
|
struct dentry *tracefs_failed_creating(struct dentry *dentry)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_unlock(d_inode(dentry->d_parent));
|
2015-01-21 00:36:55 +08:00
|
|
|
dput(dentry);
|
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-07-29 02:20:45 +08:00
|
|
|
struct dentry *tracefs_end_creating(struct dentry *dentry)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
2021-12-08 23:27:31 +08:00
|
|
|
inode_unlock(d_inode(dentry->d_parent));
|
2015-01-21 00:36:55 +08:00
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
|
2023-07-29 02:20:49 +08:00
|
|
|
/**
|
|
|
|
* eventfs_start_creating - start the process of creating a dentry
|
|
|
|
* @name: Name of the file created for the dentry
|
|
|
|
* @parent: The parent dentry where this dentry will be created
|
|
|
|
*
|
|
|
|
* This is a simple helper function for the dynamically created eventfs
|
|
|
|
* files. When the directory of the eventfs files are accessed, their
|
|
|
|
* dentries are created on the fly. This function is used to start that
|
|
|
|
* process.
|
|
|
|
*/
|
|
|
|
struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
int error;
|
|
|
|
|
2023-11-22 07:10:06 +08:00
|
|
|
/* Must always have a parent. */
|
|
|
|
if (WARN_ON_ONCE(!parent))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2023-07-29 02:20:49 +08:00
|
|
|
error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
|
|
|
|
&tracefs_mount_count);
|
|
|
|
if (error)
|
|
|
|
return ERR_PTR(error);
|
|
|
|
|
|
|
|
if (unlikely(IS_DEADDIR(parent->d_inode)))
|
|
|
|
dentry = ERR_PTR(-ENOENT);
|
|
|
|
else
|
|
|
|
dentry = lookup_one_len(name, parent, strlen(name));
|
|
|
|
|
|
|
|
if (!IS_ERR(dentry) && dentry->d_inode) {
|
|
|
|
dput(dentry);
|
|
|
|
dentry = ERR_PTR(-EEXIST);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (IS_ERR(dentry))
|
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
|
|
|
|
|
|
|
return dentry;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* eventfs_failed_creating - clean up a failed eventfs dentry creation
|
|
|
|
* @dentry: The dentry to clean up
|
|
|
|
*
|
|
|
|
* If after calling eventfs_start_creating(), a failure is detected, the
|
|
|
|
* resources created by eventfs_start_creating() needs to be cleaned up. In
|
|
|
|
* that case, this function should be called to perform that clean up.
|
|
|
|
*/
|
|
|
|
struct dentry *eventfs_failed_creating(struct dentry *dentry)
|
|
|
|
{
|
|
|
|
dput(dentry);
|
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * eventfs_end_creating - Finish the process of creating a eventfs dentry
 * @dentry: The dentry that has successfully been created.
 *
 * Currently a placeholder that pairs with eventfs_start_creating(). Should
 * synchronization ever be needed at the end of creation, it can be added
 * here without touching the callers of eventfs_start_creating().
 *
 * Returns @dentry unchanged.
 */
struct dentry *eventfs_end_creating(struct dentry *dentry)
{
	return dentry;
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
/**
|
|
|
|
* tracefs_create_file - create a file in the tracefs filesystem
|
|
|
|
* @name: a pointer to a string containing the name of the file to create.
|
|
|
|
* @mode: the permission that the file should have.
|
|
|
|
* @parent: a pointer to the parent dentry for this file. This should be a
|
|
|
|
* directory dentry if set. If this parameter is NULL, then the
|
|
|
|
* file will be created in the root of the tracefs filesystem.
|
|
|
|
* @data: a pointer to something that the caller will want to get to later
|
|
|
|
* on. The inode.i_private pointer will point to this value on
|
|
|
|
* the open() call.
|
|
|
|
* @fops: a pointer to a struct file_operations that should be used for
|
|
|
|
* this file.
|
|
|
|
*
|
|
|
|
* This is the basic "create a file" function for tracefs. It allows for a
|
|
|
|
* wide range of flexibility in creating a file, or a directory (if you want
|
|
|
|
* to create a directory, the tracefs_create_dir() function is
|
|
|
|
* recommended to be used instead.)
|
|
|
|
*
|
|
|
|
* This function will return a pointer to a dentry if it succeeds. This
|
|
|
|
* pointer must be passed to the tracefs_remove() function when the file is
|
|
|
|
* to be removed (no automatic cleanup happens if your module is unloaded,
|
|
|
|
* you are responsible here.) If an error occurs, %NULL will be returned.
|
|
|
|
*
|
|
|
|
* If tracefs is not enabled in the kernel, the value -%ENODEV will be
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
struct dentry *tracefs_create_file(const char *name, umode_t mode,
|
|
|
|
struct dentry *parent, void *data,
|
|
|
|
const struct file_operations *fops)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
struct inode *inode;
|
|
|
|
|
2019-10-12 08:41:41 +08:00
|
|
|
if (security_locked_down(LOCKDOWN_TRACEFS))
|
|
|
|
return NULL;
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
if (!(mode & S_IFMT))
|
|
|
|
mode |= S_IFREG;
|
|
|
|
BUG_ON(!S_ISREG(mode));
|
2023-07-29 02:20:45 +08:00
|
|
|
dentry = tracefs_start_creating(name, parent);
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
if (IS_ERR(dentry))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
inode = tracefs_get_inode(dentry->d_sb);
|
|
|
|
if (unlikely(!inode))
|
2023-07-29 02:20:45 +08:00
|
|
|
return tracefs_failed_creating(dentry);
|
2015-01-21 00:36:55 +08:00
|
|
|
|
|
|
|
inode->i_mode = mode;
|
2019-10-12 01:54:58 +08:00
|
|
|
inode->i_fop = fops ? fops : &tracefs_file_operations;
|
2015-01-21 00:36:55 +08:00
|
|
|
inode->i_private = data;
|
2021-12-08 20:57:20 +08:00
|
|
|
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
|
|
|
|
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
|
2015-01-21 00:36:55 +08:00
|
|
|
d_instantiate(dentry, inode);
|
2021-12-08 23:27:31 +08:00
|
|
|
fsnotify_create(d_inode(dentry->d_parent), dentry);
|
2023-07-29 02:20:45 +08:00
|
|
|
return tracefs_end_creating(dentry);
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
static struct dentry *__create_dir(const char *name, struct dentry *parent,
|
|
|
|
const struct inode_operations *ops)
|
|
|
|
{
|
2023-07-29 02:20:45 +08:00
|
|
|
struct dentry *dentry = tracefs_start_creating(name, parent);
|
2015-01-21 23:01:39 +08:00
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
if (IS_ERR(dentry))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
inode = tracefs_get_inode(dentry->d_sb);
|
|
|
|
if (unlikely(!inode))
|
2023-07-29 02:20:45 +08:00
|
|
|
return tracefs_failed_creating(dentry);
|
2015-01-21 23:01:39 +08:00
|
|
|
|
2021-08-18 23:24:50 +08:00
|
|
|
/* Do not set bits for OTH */
|
|
|
|
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
|
2015-01-21 23:01:39 +08:00
|
|
|
inode->i_op = ops;
|
|
|
|
inode->i_fop = &simple_dir_operations;
|
2021-12-08 20:57:20 +08:00
|
|
|
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
|
|
|
|
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
|
2015-01-21 23:01:39 +08:00
|
|
|
|
|
|
|
/* directory inodes start off with i_nlink == 2 (for "." entry) */
|
|
|
|
inc_nlink(inode);
|
|
|
|
d_instantiate(dentry, inode);
|
2021-12-08 23:27:31 +08:00
|
|
|
inc_nlink(d_inode(dentry->d_parent));
|
|
|
|
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
|
2023-07-29 02:20:45 +08:00
|
|
|
return tracefs_end_creating(dentry);
|
2015-01-21 23:01:39 +08:00
|
|
|
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
/**
|
|
|
|
* tracefs_create_dir - create a directory in the tracefs filesystem
|
|
|
|
* @name: a pointer to a string containing the name of the directory to
|
|
|
|
* create.
|
|
|
|
* @parent: a pointer to the parent dentry for this file. This should be a
|
|
|
|
* directory dentry if set. If this parameter is NULL, then the
|
|
|
|
* directory will be created in the root of the tracefs filesystem.
|
|
|
|
*
|
|
|
|
* This function creates a directory in tracefs with the given name.
|
|
|
|
*
|
|
|
|
* This function will return a pointer to a dentry if it succeeds. This
|
|
|
|
* pointer must be passed to the tracefs_remove() function when the file is
|
|
|
|
* to be removed. If an error occurs, %NULL will be returned.
|
|
|
|
*
|
|
|
|
* If tracing is not enabled in the kernel, the value -%ENODEV will be
|
|
|
|
* returned.
|
|
|
|
*/
|
|
|
|
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
|
|
|
|
{
|
2023-09-06 02:26:08 +08:00
|
|
|
if (security_locked_down(LOCKDOWN_TRACEFS))
|
|
|
|
return NULL;
|
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
return __create_dir(name, parent, &simple_dir_inode_operations);
|
|
|
|
}
|
2015-01-21 00:36:55 +08:00
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
/**
|
|
|
|
* tracefs_create_instance_dir - create the tracing instances directory
|
|
|
|
* @name: The name of the instances directory to create
|
|
|
|
* @parent: The parent directory that the instances directory will exist
|
|
|
|
* @mkdir: The function to call when a mkdir is performed.
|
|
|
|
* @rmdir: The function to call when a rmdir is performed.
|
|
|
|
*
|
|
|
|
* Only one instances directory is allowed.
|
|
|
|
*
|
2022-06-05 17:27:29 +08:00
|
|
|
* The instances directory is special as it allows for mkdir and rmdir
|
2015-01-21 23:01:39 +08:00
|
|
|
* to be done by userspace. When a mkdir or rmdir is performed, the inode
|
2021-03-24 01:49:35 +08:00
|
|
|
* locks are released and the methods passed in (@mkdir and @rmdir) are
|
2015-01-21 23:01:39 +08:00
|
|
|
* called without locks and with the name of the directory being created
|
|
|
|
* within the instances directory.
|
|
|
|
*
|
|
|
|
* Returns the dentry of the instances directory.
|
|
|
|
*/
|
2018-07-26 01:19:01 +08:00
|
|
|
__init struct dentry *tracefs_create_instance_dir(const char *name,
|
|
|
|
struct dentry *parent,
|
2015-01-21 23:01:39 +08:00
|
|
|
int (*mkdir)(const char *name),
|
|
|
|
int (*rmdir)(const char *name))
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
|
|
|
|
/* Only allow one instance of the instances directory. */
|
|
|
|
if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
|
2015-01-21 00:36:55 +08:00
|
|
|
return NULL;
|
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
|
|
|
|
if (!dentry)
|
|
|
|
return NULL;
|
2015-01-21 00:36:55 +08:00
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
tracefs_ops.mkdir = mkdir;
|
|
|
|
tracefs_ops.rmdir = rmdir;
|
2015-01-21 00:36:55 +08:00
|
|
|
|
2015-01-21 23:01:39 +08:00
|
|
|
return dentry;
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
|
2019-11-18 22:43:10 +08:00
|
|
|
static void remove_one(struct dentry *victim)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
2019-11-18 22:43:10 +08:00
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2019-11-18 22:43:10 +08:00
|
|
|
* tracefs_remove - recursively removes a directory
|
2015-01-21 00:36:55 +08:00
|
|
|
* @dentry: a pointer to a the dentry of the directory to be removed.
|
|
|
|
*
|
|
|
|
* This function recursively removes a directory tree in tracefs that
|
|
|
|
* was previously created with a call to another tracefs function
|
|
|
|
* (like tracefs_create_file() or variants thereof.)
|
|
|
|
*/
|
2019-11-18 22:43:10 +08:00
|
|
|
void tracefs_remove(struct dentry *dentry)
|
2015-01-21 00:36:55 +08:00
|
|
|
{
|
|
|
|
if (IS_ERR_OR_NULL(dentry))
|
|
|
|
return;
|
|
|
|
|
2019-11-18 22:43:10 +08:00
|
|
|
simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count);
|
|
|
|
simple_recursive_removal(dentry, remove_one);
|
|
|
|
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
|
2015-01-21 00:36:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* tracefs_initialized - Tells whether tracefs has been registered
|
|
|
|
*/
|
|
|
|
bool tracefs_initialized(void)
|
|
|
|
{
|
|
|
|
return tracefs_registered;
|
|
|
|
}
|
|
|
|
|
2023-07-29 02:20:44 +08:00
|
|
|
static void init_once(void *foo)
|
|
|
|
{
|
|
|
|
struct tracefs_inode *ti = (struct tracefs_inode *) foo;
|
|
|
|
|
|
|
|
inode_init_once(&ti->vfs_inode);
|
|
|
|
}
|
|
|
|
|
2015-01-21 00:36:55 +08:00
|
|
|
/*
 * Boot-time initialization of tracefs (core_initcall).
 *
 * Creates the slab cache used for tracefs inodes, creates the "tracing"
 * mount point under the kernel kobject in sysfs and registers the tracefs
 * file system type. tracefs_registered is set only when all three steps
 * succeed.
 *
 * Every step that fails unwinds the steps that came before it, so a
 * failed init leaves neither the inode cache nor the sysfs mount point
 * behind.
 *
 * Returns 0 on success, -ENOMEM if the inode cache cannot be created,
 * -EINVAL if the sysfs mount point cannot be created, or the error from
 * register_filesystem().
 */
static int __init tracefs_init(void)
{
	int retval;

	tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache",
						 sizeof(struct tracefs_inode),
						 0, (SLAB_RECLAIM_ACCOUNT|
						     SLAB_MEM_SPREAD|
						     SLAB_ACCOUNT),
						 init_once);
	if (!tracefs_inode_cachep)
		return -ENOMEM;

	retval = sysfs_create_mount_point(kernel_kobj, "tracing");
	if (retval) {
		/* Historical behaviour: report any failure here as -EINVAL. */
		retval = -EINVAL;
		goto out_destroy_cache;
	}

	retval = register_filesystem(&trace_fs_type);
	if (retval)
		goto out_remove_mount_point;

	tracefs_registered = true;

	return 0;

out_remove_mount_point:
	sysfs_remove_mount_point(kernel_kobj, "tracing");
out_destroy_cache:
	/* No inode can have been allocated yet, so the cache is empty. */
	kmem_cache_destroy(tracefs_inode_cachep);
	tracefs_inode_cachep = NULL;

	return retval;
}
core_initcall(tracefs_init);
|