From 90f8572b0f021fdd1baa68e00a8c30482ee9e5f4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 29 Jun 2015 14:42:03 -0500
Subject: [PATCH 1/6] vfs: Commit to never having exectuables on proc and
 sysfs.

Today proc and sysfs do not contain any executable files.  Several
applications today mount proc or sysfs without noexec and nosuid and
then depend on there being no exectuables files on proc or sysfs.
Having any executable files show on proc or sysfs would cause
a user space visible regression, and most likely security problems.

Therefore commit to never allowing executables on proc and sysfs by
adding a new flag to mark them as filesystems without executables and
enforce that flag.

Test the flag where MNT_NOEXEC is tested today, so that the only user
visible effect will be that exectuables will be treated as if the
execute bit is cleared.

The filesystems proc and sysfs do not currently incoporate any
executable files so this does not result in any user visible effects.

This makes it unnecessary to vet changes to proc and sysfs tightly for
adding exectuable files or changes to chattr that would modify
existing files, as no matter what the individual file say they will
not be treated as exectuable files by the vfs.

Not having to vet changes to closely is important as without this we
are only one proc_create call (or another goof up in the
implementation of notify_change) from having problematic executables
on proc.  Those mistakes are all too easy to make and would create
a situation where there are security issues or the assumptions of
some program having to be broken (and cause userspace regressions).

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/exec.c           | 10 ++++++++--
 fs/open.c           |  2 +-
 fs/proc/root.c      |  2 ++
 fs/sysfs/mount.c    |  4 ++++
 include/linux/fs.h  |  3 +++
 kernel/sys.c        |  3 +--
 mm/mmap.c           |  4 ++--
 mm/nommu.c          |  2 +-
 security/security.c |  2 +-
 9 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a553ac..b06623a9347f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -98,6 +98,12 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
 	module_put(fmt->module);
 }
 
+bool path_noexec(const struct path *path)
+{
+	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
+	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
+}
+
 #ifdef CONFIG_USELIB
 /*
  * Note that a shared library must be both readable and executable due to
@@ -132,7 +138,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto exit;
 
 	error = -EACCES;
-	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (path_noexec(&file->f_path))
 		goto exit;
 
 	fsnotify_open(file);
@@ -777,7 +783,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 	if (!S_ISREG(file_inode(file)->i_mode))
 		goto exit;
 
-	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (path_noexec(&file->f_path))
 		goto exit;
 
 	err = deny_write_access(file);
diff --git a/fs/open.c b/fs/open.c
index e33dab287fa0..b6f1e96a7c0b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -377,7 +377,7 @@ retry:
 		 * with the "noexec" flag.
 		 */
 		res = -EACCES;
-		if (path.mnt->mnt_flags & MNT_NOEXEC)
+		if (path_noexec(&path))
 			goto out_path_release;
 	}
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68feb0f70e63..361ab4ee42fc 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		}
 
 		sb->s_flags |= MS_ACTIVE;
+		/* User space would break if executables appear on proc */
+		sb->s_iflags |= SB_I_NOEXEC;
 	}
 
 	return dget(sb->s_root);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 1c6ac6fcee9f..f3db82071cfb 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -40,6 +40,10 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 				SYSFS_MAGIC, &new_sb, ns);
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+	else if (new_sb)
+		/* Userspace would break if executables appear on sysfs */
+		root->d_sb->s_iflags |= SB_I_NOEXEC;
+
 	return root;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a0653e560c26..42912f8d286e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1244,6 +1244,7 @@ struct mm_struct;
 
 /* sb->s_iflags */
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
+#define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -3030,4 +3031,6 @@ static inline bool dir_relax(struct inode *inode)
 	return !IS_DEADDIR(inode);
 }
 
+extern bool path_noexec(const struct path *path);
+
 #endif /* _LINUX_FS_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	 * overall picture.
 	 */
 	err = -EACCES;
-	if (!S_ISREG(inode->i_mode)	||
-	    exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
 		goto exit;
 
 	err = inode_permission(inode, MAY_EXEC);
diff --git a/mm/mmap.c b/mm/mmap.c
index aa632ade2be7..f126923ce683 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	 *  mounted, in which case we dont add PROT_EXEC.)
 	 */
 	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+		if (!(file && path_noexec(&file->f_path)))
 			prot |= PROT_EXEC;
 
 	if (!(flags & MAP_FIXED))
@@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
-			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+			if (path_noexec(&file->f_path)) {
 				if (vm_flags & VM_EXEC)
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
diff --git a/mm/nommu.c b/mm/nommu.c
index 58ea3643b9e9..ce17abf087ff 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file,
 
 		/* handle executable mappings and implied executable
 		 * mappings */
-		if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+		if (path_noexec(&file->f_path)) {
 			if (prot & PROT_EXEC)
 				return -EPERM;
 		} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
diff --git a/security/security.c b/security/security.c
index 595fffab48b0..062f3c997fdc 100644
--- a/security/security.c
+++ b/security/security.c
@@ -776,7 +776,7 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 	 * ditto if it's not on noexec mount, except that on !MMU we need
 	 * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case
 	 */
-	if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
+	if (!path_noexec(&file->f_path)) {
 #ifndef CONFIG_MMU
 		if (file->f_op->mmap_capabilities) {
 			unsigned caps = file->f_op->mmap_capabilities(file);

From 77b1a97d218277d55a15016826d1fd79290f1df2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 4 Jun 2015 09:43:11 -0500
Subject: [PATCH 2/6] mnt: fs_fully_visible enforce noexec and nosuid  if
 !SB_I_NOEXEC

The filesystems proc and sysfs do not have executable files do not
have exectuable files today and portions of userspace break if we do
enforce nosuid and noexec consistency of nosuid and noexec flags
between previous mounts and new mounts of proc and sysfs.

Add the code to enforce consistency of the nosuid and noexec flags,
and use the presence of SB_I_NOEXEC to signal that there is no need to
bother.

This results in a completely userspace invisible change that makes it
clear fs_fully_visible can only skip the enforcement of noexec and
nosuid because it is known the filesystems in question do not support
executables.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index c7cb8a526c05..ce428cadd41f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3194,6 +3194,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
 	down_read(&namespace_sem);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
+		int mnt_flags;
+
 		if (mnt->mnt.mnt_sb->s_type != type)
 			continue;
 
@@ -3203,17 +3205,30 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
 		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
 			continue;
 
+		/* Read the mount flags and filter out flags that
+		 * may safely be ignored.
+		 */
+		mnt_flags = mnt->mnt.mnt_flags;
+		if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
+			mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
+
 		/* Verify the mount flags are equal to or more permissive
 		 * than the proposed new mount.
 		 */
-		if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
+		if ((mnt_flags & MNT_LOCK_READONLY) &&
 		    !(new_flags & MNT_READONLY))
 			continue;
-		if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
+		if ((mnt_flags & MNT_LOCK_NODEV) &&
 		    !(new_flags & MNT_NODEV))
 			continue;
-		if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
-		    ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
+		if ((mnt_flags & MNT_LOCK_NOSUID) &&
+		    !(new_flags & MNT_NOSUID))
+			continue;
+		if ((mnt_flags & MNT_LOCK_NOEXEC) &&
+		    !(new_flags & MNT_NOEXEC))
+			continue;
+		if ((mnt_flags & MNT_LOCK_ATIME) &&
+		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
 			continue;
 
 		/* This mount is not fully visible if there are any
@@ -3223,16 +3238,18 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
 		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
 			struct inode *inode = child->mnt_mountpoint->d_inode;
 			/* Only worry about locked mounts */
-			if (!(mnt->mnt.mnt_flags & MNT_LOCKED))
+			if (!(mnt_flags & MNT_LOCKED))
 				continue;
 			/* Is the directory permanetly empty? */
 			if (!is_empty_dir_inode(inode))
 				goto next;
 		}
 		/* Preserve the locked attributes */
-		*new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
-							MNT_LOCK_NODEV    | \
-							MNT_LOCK_ATIME);
+		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
+					       MNT_LOCK_NODEV    | \
+					       MNT_LOCK_NOSUID   | \
+					       MNT_LOCK_NOEXEC   | \
+					       MNT_LOCK_ATIME);
 		visible = true;
 		goto found;
 	next:	;

From 75509fd88fbd580c793780b0001c71c3510f2726 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 24 May 2015 12:49:04 -0500
Subject: [PATCH 3/6] nsfs: Add a show_path method to fix mountinfo

Today mountinfo displays a very unhelpful "/" for nsfs files.  Add a
show_path method returning the same string as ns_dname.  This results
in a bind mount of /proc/<pid>/ns/net showing up in /proc/<pid>/mountinfo as
"net:[1234...]" instead of "/".

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/nsfs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 99521e7c492b..e4905fbf3396 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -4,6 +4,7 @@
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
 #include <linux/ktime.h>
+#include <linux/seq_file.h>
 
 static struct vfsmount *nsfs_mnt;
 
@@ -136,9 +137,18 @@ out_invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+
+	return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+}
+
 static const struct super_operations nsfs_ops = {
 	.statfs = simple_statfs,
 	.evict_inode = nsfs_evict,
+	.show_path = nsfs_show_path,
 };
 static struct dentry *nsfs_mount(struct file_system_type *fs_type,
 			int flags, const char *dev_name, void *data)

From 12c641ab8270f787dfcce08b5f20ce8b65008096 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Aug 2015 17:35:07 -0500
Subject: [PATCH 4/6] unshare: Unsharing a thread does not require unsharing a
 vm

In the logic in the initial commit of unshare made creating a new
thread group for a process, contingent upon creating a new memory
address space for that process.  That is wrong.  Two separate
processes in different thread groups can share a memory address space
and clone allows creation of such proceses.

This is significant because it was observed that mm_users > 1 does not
mean that a process is multi-threaded, as reading /proc/PID/maps
temporarily increments mm_users, which allows other processes to
(accidentally) interfere with unshare() calls.

Correct the check in check_unshare_flags() to test for
!thread_group_empty() for CLONE_THREAD, CLONE_SIGHAND, and CLONE_VM.
For sighand->count > 1 for CLONE_SIGHAND and CLONE_VM.
For !current_is_single_threaded instead of mm_users > 1 for CLONE_VM.

By using the correct checks in unshare this removes the possibility of
an accidental denial of service attack.

Additionally using the correct checks in unshare ensures that only an
explicit unshare(CLONE_VM) can possibly trigger the slow path of
current_is_single_threaded().  As an explict unshare(CLONE_VM) is
pointless it is not expected there are many applications that make
that call.

Cc: stable@vger.kernel.org
Fixes: b2e0d98705e60e45bbb3c0032c48824ad7ae0704 userns: Implement unshare of the user namespace
Reported-by: Ricky Zhou <rickyz@chromium.org>
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..d544ae97f999 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1866,13 +1866,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
 				CLONE_NEWUSER|CLONE_NEWPID))
 		return -EINVAL;
 	/*
-	 * Not implemented, but pretend it works if there is nothing to
-	 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
-	 * needs to unshare vm.
+	 * Not implemented, but pretend it works if there is nothing
+	 * to unshare.  Note that unsharing the address space or the
+	 * signal handlers also need to unshare the signal queues (aka
+	 * CLONE_THREAD).
 	 */
 	if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
-		/* FIXME: get_task_mm() increments ->mm_users */
-		if (atomic_read(&current->mm->mm_users) > 1)
+		if (!thread_group_empty(current))
+			return -EINVAL;
+	}
+	if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+		if (atomic_read(&current->sighand->count) > 1)
+			return -EINVAL;
+	}
+	if (unshare_flags & CLONE_VM) {
+		if (!current_is_single_threaded())
 			return -EINVAL;
 	}
 
@@ -1940,16 +1948,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	 */
 	if (unshare_flags & CLONE_NEWUSER)
 		unshare_flags |= CLONE_THREAD | CLONE_FS;
-	/*
-	 * If unsharing a thread from a thread group, must also unshare vm.
-	 */
-	if (unshare_flags & CLONE_THREAD)
-		unshare_flags |= CLONE_VM;
 	/*
 	 * If unsharing vm, must also unshare signal handlers.
 	 */
 	if (unshare_flags & CLONE_VM)
 		unshare_flags |= CLONE_SIGHAND;
+	/*
+	 * If unsharing a signal handlers, must also unshare the signal queues.
+	 */
+	if (unshare_flags & CLONE_SIGHAND)
+		unshare_flags |= CLONE_THREAD;
 	/*
 	 * If unsharing namespace, must also unshare filesystem information.
 	 */

From faf00da544045fdc1454f3b9e6d7f65c841de302 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Aug 2015 18:25:44 -0500
Subject: [PATCH 5/6] userns,pidns: Force thread group sharing, not signal
 handler sharing.

The code that places signals in signal queues computes the uids, gids,
and pids at the time the signals are enqueued.  Which means that tasks
that share signal queues must be in the same pid and user namespaces.

Sharing signal handlers is fine, but bizarre.

So make the code in fork and userns_install clearer by only testing
for what is functionally necessary.

Also update the comment in unshare about unsharing a user namespace to
be a little more explicit and make a little more sense.

Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/fork.c           | 8 ++++----
 kernel/user_namespace.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d544ae97f999..2c72b8a8ae24 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1273,10 +1273,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/*
 	 * If the new process will be in a different pid or user namespace
-	 * do not allow it to share a thread group or signal handlers or
-	 * parent with the forking task.
+	 * do not allow it to share a thread group with the forking task.
 	 */
-	if (clone_flags & CLONE_SIGHAND) {
+	if (clone_flags & CLONE_THREAD) {
 		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
 		    (task_active_pid_ns(current) !=
 				current->nsproxy->pid_ns_for_children))
@@ -1944,7 +1943,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	int err;
 
 	/*
-	 * If unsharing a user namespace must also unshare the thread.
+	 * If unsharing a user namespace must also unshare the thread group
+	 * and unshare the filesystem root and working directories.
 	 */
 	if (unshare_flags & CLONE_NEWUSER)
 		unshare_flags |= CLONE_THREAD | CLONE_FS;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..f65a0a06a8c0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -976,8 +976,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
 	if (user_ns == current_user_ns())
 		return -EINVAL;
 
-	/* Threaded processes may not enter a different user namespace */
-	if (atomic_read(&current->mm->mm_users) > 1)
+	/* Tasks that share a thread group must share a user namespace */
+	if (!thread_group_empty(current))
 		return -EINVAL;
 
 	if (current->fs->users != 1)

From 4b75de8615050c1b0dd8d7794838c42f74ed36ba Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Aug 2015 15:00:12 -0500
Subject: [PATCH 6/6] fs: Set the size of empty dirs to 0.

Before the make_empty_dir_inode calls were introduce into proc, sysfs,
and sysctl those directories when stated reported an i_size of 0.
make_empty_dir_inode started reporting an i_size of 2.  At least one
userspace application depended on stat returning i_size of 0.  So
modify make_empty_dir_inode to cause an i_size of 0 to be reported for
these directories.

Cc: stable@vger.kernel.org
Reported-by: Tejun Heo <tj@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/libfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index 102edfd39000..c7cbfb092e94 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1185,7 +1185,7 @@ void make_empty_dir_inode(struct inode *inode)
 	inode->i_uid = GLOBAL_ROOT_UID;
 	inode->i_gid = GLOBAL_ROOT_GID;
 	inode->i_rdev = 0;
-	inode->i_size = 2;
+	inode->i_size = 0;
 	inode->i_blkbits = PAGE_SHIFT;
 	inode->i_blocks = 0;