mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-15 18:34:47 +08:00
6a2b60b17b
Pull user namespace changes from Eric Biederman: "While small, this set of changes is very significant with respect to containers in general and user namespaces in particular. The user space interface is now complete. This set of changes adds support for unprivileged users to create user namespaces and as a user namespace root to create other namespaces. The tyranny of supporting suid root preventing unprivileged users from using cool new kernel features is broken. This set of changes completes the work on setns, adding support for the pid, user, and mount namespaces. This set of changes includes a bunch of basic pid namespace cleanups/simplifications. Of particular significance is the rework of the pid namespace cleanup so it no longer requires sending out tendrils into all kinds of unexpected cleanup paths for operation. At least one case of broken error handling is fixed by this cleanup. The files under /proc/<pid>/ns/ have been converted from regular files to magic symlinks which prevents incorrect caching by the VFS, ensuring the files always refer to the namespace the process is currently using and ensuring that the ptrace_may_access permission checks are always applied. The files under /proc/<pid>/ns/ have been given stable inode numbers so it is now possible to see if different processes share the same namespaces. Through David Miller's net tree are changes to relax many of the permission checks in the networking stack to allow the user namespace root to usefully use the networking stack. Similar changes for the mount namespace and the pid namespace are coming through my tree. Two small changes to add user namespace support were committed here and in David Miller's -net tree so that I could complete the work on the /proc/<pid>/ns/ files in this tree. 
Work remains to make it safe to build user namespaces and 9p, afs, ceph, cifs, coda, gfs2, ncpfs, nfs, nfsd, ocfs2, and xfs so the Kconfig guard remains in place preventing user namespaces from being built when any of those filesystems are enabled. Future design work remains to allow root users outside of the initial user namespace to mount more than just /proc and /sys." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: (38 commits) proc: Usable inode numbers for the namespace file descriptors. proc: Fix the namespace inode permission checks. proc: Generalize proc inode allocation userns: Allow unprivilged mounts of proc and sysfs userns: For /proc/self/{uid,gid}_map derive the lower userns from the struct file procfs: Print task uids and gids in the userns that opened the proc file userns: Implement unshare of the user namespace userns: Implent proc namespace operations userns: Kill task_user_ns userns: Make create_new_namespaces take a user_ns parameter userns: Allow unprivileged use of setns. userns: Allow unprivileged users to create new namespaces userns: Allow setting a userns mapping to your current uid. userns: Allow chown and setgid preservation userns: Allow unprivileged users to create user namespaces. userns: Ignore suid and sgid on binaries if the uid or gid can not be mapped userns: fix return value on mntns_install() failure vfs: Allow unprivileged manipulation of the mount namespace. vfs: Only support slave subtrees across different user namespaces vfs: Add a user namespace reference from struct mnt_namespace ...
446 lines
10 KiB
C
446 lines
10 KiB
C
/*
|
|
* Yama Linux Security Module
|
|
*
|
|
* Author: Kees Cook <keescook@chromium.org>
|
|
*
|
|
* Copyright (C) 2010 Canonical, Ltd.
|
|
* Copyright (C) 2011 The Chromium OS Authors.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2, as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
*/
|
|
|
|
#include <linux/security.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/prctl.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/workqueue.h>
|
|
|
|
/*
 * Policy levels for ptrace_scope, from least to most restrictive:
 *
 *  DISABLED   - Yama adds no restriction on top of the capability checks.
 *  RELATIONAL - attach is allowed to descendants of the tracer, to tasks
 *               that registered an exception for the tracer, or with
 *               CAP_SYS_PTRACE.
 *  CAPABILITY - attach and traceme both require CAP_SYS_PTRACE.
 *  NO_ATTACH  - attach and traceme are always refused.
 */
#define YAMA_SCOPE_DISABLED	0
#define YAMA_SCOPE_RELATIONAL	1
#define YAMA_SCOPE_CAPABILITY	2
#define YAMA_SCOPE_NO_ATTACH	3

/* Active policy level; starts out at the relational restriction. */
static int ptrace_scope = YAMA_SCOPE_RELATIONAL;
|
|
|
|
/* describe a ptrace relationship for potential exception */
|
|
struct ptrace_relation {
|
|
struct task_struct *tracer;
|
|
struct task_struct *tracee;
|
|
bool invalid;
|
|
struct list_head node;
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
/*
 * List of registered exceptions. Entries are inserted and unlinked with
 * ptracer_relations_lock held; lookups walk the list under RCU.
 */
static LIST_HEAD(ptracer_relations);
static DEFINE_SPINLOCK(ptracer_relations_lock);

/* Work item that unlinks and frees entries flagged invalid. */
static void yama_relation_cleanup(struct work_struct *work);
static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
|
|
|
|
/**
|
|
* yama_relation_cleanup - remove invalid entries from the relation list
|
|
*
|
|
*/
|
|
static void yama_relation_cleanup(struct work_struct *work)
|
|
{
|
|
struct ptrace_relation *relation;
|
|
|
|
spin_lock(&ptracer_relations_lock);
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(relation, &ptracer_relations, node) {
|
|
if (relation->invalid) {
|
|
list_del_rcu(&relation->node);
|
|
kfree_rcu(relation, rcu);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
spin_unlock(&ptracer_relations_lock);
|
|
}
|
|
|
|
/**
|
|
* yama_ptracer_add - add/replace an exception for this tracer/tracee pair
|
|
* @tracer: the task_struct of the process doing the ptrace
|
|
* @tracee: the task_struct of the process to be ptraced
|
|
*
|
|
* Each tracee can have, at most, one tracer registered. Each time this
|
|
* is called, the prior registered tracer will be replaced for the tracee.
|
|
*
|
|
* Returns 0 if relationship was added, -ve on error.
|
|
*/
|
|
static int yama_ptracer_add(struct task_struct *tracer,
|
|
struct task_struct *tracee)
|
|
{
|
|
struct ptrace_relation *relation, *added;
|
|
|
|
added = kmalloc(sizeof(*added), GFP_KERNEL);
|
|
if (!added)
|
|
return -ENOMEM;
|
|
|
|
added->tracee = tracee;
|
|
added->tracer = tracer;
|
|
added->invalid = false;
|
|
|
|
spin_lock(&ptracer_relations_lock);
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(relation, &ptracer_relations, node) {
|
|
if (relation->invalid)
|
|
continue;
|
|
if (relation->tracee == tracee) {
|
|
list_replace_rcu(&relation->node, &added->node);
|
|
kfree_rcu(relation, rcu);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
list_add_rcu(&added->node, &ptracer_relations);
|
|
|
|
out:
|
|
rcu_read_unlock();
|
|
spin_unlock(&ptracer_relations_lock);
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* yama_ptracer_del - remove exceptions related to the given tasks
|
|
* @tracer: remove any relation where tracer task matches
|
|
* @tracee: remove any relation where tracee task matches
|
|
*/
|
|
static void yama_ptracer_del(struct task_struct *tracer,
|
|
struct task_struct *tracee)
|
|
{
|
|
struct ptrace_relation *relation;
|
|
bool marked = false;
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(relation, &ptracer_relations, node) {
|
|
if (relation->invalid)
|
|
continue;
|
|
if (relation->tracee == tracee ||
|
|
(tracer && relation->tracer == tracer)) {
|
|
relation->invalid = true;
|
|
marked = true;
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
if (marked)
|
|
schedule_work(&yama_relation_work);
|
|
}
|
|
|
|
/**
 * yama_task_free - drop exceptions that refer to a task being removed
 * @task: task being removed
 *
 * The task is passed as both tracer and tracee, so every relation in which
 * it appears in either role is invalidated.
 */
void yama_task_free(struct task_struct *task)
{
	yama_ptracer_del(task, task);
}
|
|
|
|
/**
|
|
* yama_task_prctl - check for Yama-specific prctl operations
|
|
* @option: operation
|
|
* @arg2: argument
|
|
* @arg3: argument
|
|
* @arg4: argument
|
|
* @arg5: argument
|
|
*
|
|
* Return 0 on success, -ve on error. -ENOSYS is returned when Yama
|
|
* does not handle the given option.
|
|
*/
|
|
int yama_task_prctl(int option, unsigned long arg2, unsigned long arg3,
|
|
unsigned long arg4, unsigned long arg5)
|
|
{
|
|
int rc;
|
|
struct task_struct *myself = current;
|
|
|
|
rc = cap_task_prctl(option, arg2, arg3, arg4, arg5);
|
|
if (rc != -ENOSYS)
|
|
return rc;
|
|
|
|
switch (option) {
|
|
case PR_SET_PTRACER:
|
|
/* Since a thread can call prctl(), find the group leader
|
|
* before calling _add() or _del() on it, since we want
|
|
* process-level granularity of control. The tracer group
|
|
* leader checking is handled later when walking the ancestry
|
|
* at the time of PTRACE_ATTACH check.
|
|
*/
|
|
rcu_read_lock();
|
|
if (!thread_group_leader(myself))
|
|
myself = rcu_dereference(myself->group_leader);
|
|
get_task_struct(myself);
|
|
rcu_read_unlock();
|
|
|
|
if (arg2 == 0) {
|
|
yama_ptracer_del(NULL, myself);
|
|
rc = 0;
|
|
} else if (arg2 == PR_SET_PTRACER_ANY || (int)arg2 == -1) {
|
|
rc = yama_ptracer_add(NULL, myself);
|
|
} else {
|
|
struct task_struct *tracer;
|
|
|
|
rcu_read_lock();
|
|
tracer = find_task_by_vpid(arg2);
|
|
if (tracer)
|
|
get_task_struct(tracer);
|
|
else
|
|
rc = -EINVAL;
|
|
rcu_read_unlock();
|
|
|
|
if (tracer) {
|
|
rc = yama_ptracer_add(tracer, myself);
|
|
put_task_struct(tracer);
|
|
}
|
|
}
|
|
|
|
put_task_struct(myself);
|
|
break;
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/**
|
|
* task_is_descendant - walk up a process family tree looking for a match
|
|
* @parent: the process to compare against while walking up from child
|
|
* @child: the process to start from while looking upwards for parent
|
|
*
|
|
* Returns 1 if child is a descendant of parent, 0 if not.
|
|
*/
|
|
static int task_is_descendant(struct task_struct *parent,
|
|
struct task_struct *child)
|
|
{
|
|
int rc = 0;
|
|
struct task_struct *walker = child;
|
|
|
|
if (!parent || !child)
|
|
return 0;
|
|
|
|
rcu_read_lock();
|
|
if (!thread_group_leader(parent))
|
|
parent = rcu_dereference(parent->group_leader);
|
|
while (walker->pid > 0) {
|
|
if (!thread_group_leader(walker))
|
|
walker = rcu_dereference(walker->group_leader);
|
|
if (walker == parent) {
|
|
rc = 1;
|
|
break;
|
|
}
|
|
walker = rcu_dereference(walker->real_parent);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return rc;
|
|
}
|
|
|
|
/**
|
|
* ptracer_exception_found - tracer registered as exception for this tracee
|
|
* @tracer: the task_struct of the process attempting ptrace
|
|
* @tracee: the task_struct of the process to be ptraced
|
|
*
|
|
* Returns 1 if tracer has is ptracer exception ancestor for tracee.
|
|
*/
|
|
static int ptracer_exception_found(struct task_struct *tracer,
|
|
struct task_struct *tracee)
|
|
{
|
|
int rc = 0;
|
|
struct ptrace_relation *relation;
|
|
struct task_struct *parent = NULL;
|
|
bool found = false;
|
|
|
|
rcu_read_lock();
|
|
if (!thread_group_leader(tracee))
|
|
tracee = rcu_dereference(tracee->group_leader);
|
|
list_for_each_entry_rcu(relation, &ptracer_relations, node) {
|
|
if (relation->invalid)
|
|
continue;
|
|
if (relation->tracee == tracee) {
|
|
parent = relation->tracer;
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (found && (parent == NULL || task_is_descendant(parent, tracer)))
|
|
rc = 1;
|
|
rcu_read_unlock();
|
|
|
|
return rc;
|
|
}
|
|
|
|
/**
|
|
* yama_ptrace_access_check - validate PTRACE_ATTACH calls
|
|
* @child: task that current task is attempting to ptrace
|
|
* @mode: ptrace attach mode
|
|
*
|
|
* Returns 0 if following the ptrace is allowed, -ve on error.
|
|
*/
|
|
int yama_ptrace_access_check(struct task_struct *child,
|
|
unsigned int mode)
|
|
{
|
|
int rc;
|
|
|
|
/* If standard caps disallows it, so does Yama. We should
|
|
* only tighten restrictions further.
|
|
*/
|
|
rc = cap_ptrace_access_check(child, mode);
|
|
if (rc)
|
|
return rc;
|
|
|
|
/* require ptrace target be a child of ptracer on attach */
|
|
if (mode == PTRACE_MODE_ATTACH) {
|
|
switch (ptrace_scope) {
|
|
case YAMA_SCOPE_DISABLED:
|
|
/* No additional restrictions. */
|
|
break;
|
|
case YAMA_SCOPE_RELATIONAL:
|
|
rcu_read_lock();
|
|
if (!task_is_descendant(current, child) &&
|
|
!ptracer_exception_found(current, child) &&
|
|
!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
|
|
rc = -EPERM;
|
|
rcu_read_unlock();
|
|
break;
|
|
case YAMA_SCOPE_CAPABILITY:
|
|
rcu_read_lock();
|
|
if (!ns_capable(__task_cred(child)->user_ns, CAP_SYS_PTRACE))
|
|
rc = -EPERM;
|
|
rcu_read_unlock();
|
|
break;
|
|
case YAMA_SCOPE_NO_ATTACH:
|
|
default:
|
|
rc = -EPERM;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (rc) {
|
|
printk_ratelimited(KERN_NOTICE
|
|
"ptrace of pid %d was attempted by: %s (pid %d)\n",
|
|
child->pid, current->comm, current->pid);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
/**
|
|
* yama_ptrace_traceme - validate PTRACE_TRACEME calls
|
|
* @parent: task that will become the ptracer of the current task
|
|
*
|
|
* Returns 0 if following the ptrace is allowed, -ve on error.
|
|
*/
|
|
int yama_ptrace_traceme(struct task_struct *parent)
|
|
{
|
|
int rc;
|
|
|
|
/* If standard caps disallows it, so does Yama. We should
|
|
* only tighten restrictions further.
|
|
*/
|
|
rc = cap_ptrace_traceme(parent);
|
|
if (rc)
|
|
return rc;
|
|
|
|
/* Only disallow PTRACE_TRACEME on more aggressive settings. */
|
|
switch (ptrace_scope) {
|
|
case YAMA_SCOPE_CAPABILITY:
|
|
rcu_read_lock();
|
|
if (!ns_capable(__task_cred(parent)->user_ns, CAP_SYS_PTRACE))
|
|
rc = -EPERM;
|
|
rcu_read_unlock();
|
|
break;
|
|
case YAMA_SCOPE_NO_ATTACH:
|
|
rc = -EPERM;
|
|
break;
|
|
}
|
|
|
|
if (rc) {
|
|
printk_ratelimited(KERN_NOTICE
|
|
"ptraceme of pid %d was attempted by: %s (pid %d)\n",
|
|
current->pid, parent->comm, parent->pid);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
#ifndef CONFIG_SECURITY_YAMA_STACKED
|
|
static struct security_operations yama_ops = {
|
|
.name = "yama",
|
|
|
|
.ptrace_access_check = yama_ptrace_access_check,
|
|
.ptrace_traceme = yama_ptrace_traceme,
|
|
.task_prctl = yama_task_prctl,
|
|
.task_free = yama_task_free,
|
|
};
|
|
#endif
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
static int yama_dointvec_minmax(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp, loff_t *ppos)
|
|
{
|
|
int rc;
|
|
|
|
if (write && !capable(CAP_SYS_PTRACE))
|
|
return -EPERM;
|
|
|
|
rc = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
|
if (rc)
|
|
return rc;
|
|
|
|
/* Lock the max value if it ever gets set. */
|
|
if (write && *(int *)table->data == *(int *)table->extra2)
|
|
table->extra1 = table->extra2;
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int zero;
|
|
static int max_scope = YAMA_SCOPE_NO_ATTACH;
|
|
|
|
/* Path components the Yama sysctl table is registered under: kernel/yama. */
struct ctl_path yama_sysctl_path[] = {
	{ .procname = "kernel" },
	{ .procname = "yama" },
	{ }	/* end of path */
};
|
|
|
|
static struct ctl_table yama_sysctl_table[] = {
|
|
{
|
|
.procname = "ptrace_scope",
|
|
.data = &ptrace_scope,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = yama_dointvec_minmax,
|
|
.extra1 = &zero,
|
|
.extra2 = &max_scope,
|
|
},
|
|
{ }
|
|
};
|
|
#endif /* CONFIG_SYSCTL */
|
|
|
|
static __init int yama_init(void)
|
|
{
|
|
#ifndef CONFIG_SECURITY_YAMA_STACKED
|
|
if (!security_module_enable(&yama_ops))
|
|
return 0;
|
|
#endif
|
|
|
|
printk(KERN_INFO "Yama: becoming mindful.\n");
|
|
|
|
#ifndef CONFIG_SECURITY_YAMA_STACKED
|
|
if (register_security(&yama_ops))
|
|
panic("Yama: kernel registration failed.\n");
|
|
#endif
|
|
|
|
#ifdef CONFIG_SYSCTL
|
|
if (!register_sysctl_paths(yama_sysctl_path, yama_sysctl_table))
|
|
panic("Yama: sysctl registration failed.\n");
|
|
#endif
|
|
|
|
return 0;
|
|
}
|
|
|
|
security_initcall(yama_init);
|