mm: implement memory-deny-write-execute as a prctl

Patch series "mm: In-kernel support for memory-deny-write-execute (MDWE)",
v2.

The background to this is that systemd has a configuration option called
MemoryDenyWriteExecute [2], implemented as a SECCOMP BPF filter.  Its aim
is to prevent a user task from inadvertently creating an executable
mapping that is (or was) writeable.  Since such BPF filter is stateless,
it cannot detect mappings that were previously writeable but subsequently
changed to read-only.  Therefore the filter simply rejects any
mprotect(PROT_EXEC).  The side-effect is that on arm64 with BTI support
(Branch Target Identification), the dynamic loader cannot change an ELF
section from PROT_EXEC to PROT_EXEC|PROT_BTI using mprotect().  For
libraries, it can resort to unmapping and re-mapping but for the main
executable it does not have a file descriptor.  The original bug report in
the Red Hat bugzilla - [3] - and subsequent glibc workaround for libraries
- [4].

This series adds in-kernel support for this feature as a prctl
PR_SET_MDWE, that is inherited on fork().  The prctl denies PROT_WRITE |
PROT_EXEC mappings.  Like the systemd BPF filter it also denies adding
PROT_EXEC to mappings.  However unlike the BPF filter it only denies it if
the mapping didn't previous have PROT_EXEC.  This allows to PROT_EXEC ->
PROT_EXEC | PROT_BTI with mprotect(), which is a problem with the BPF
filter.


This patch (of 2):

The aim of such policy is to prevent a user task from creating an
executable mapping that is also writeable.

An example of mmap() returning -EACCESS if the policy is enabled:

	mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, flags, 0, 0);

Similarly, mprotect() would return -EACCESS below:

	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
	mprotect(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);

The BPF filter that systemd MDWE uses is stateless, and disallows
mprotect() with PROT_EXEC completely. This new prctl allows PROT_EXEC to
be enabled if it was already PROT_EXEC, which allows the following case:

	addr = mmap(0, size, PROT_READ | PROT_EXEC, flags, 0, 0);
	mprotect(addr, size, PROT_READ | PROT_EXEC | PROT_BTI);

where PROT_BTI enables branch tracking identification on arm64.

Link: https://lkml.kernel.org/r/20230119160344.54358-1-joey.gouly@arm.com
Link: https://lkml.kernel.org/r/20230119160344.54358-2-joey.gouly@arm.com
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Co-developed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Jeremy Linton <jeremy.linton@arm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Mark Brown <broonie@kernel.org>
Cc: nd <nd@arm.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Szabolcs Nagy <szabolcs.nagy@arm.com>
Cc: Topi Miettinen <toiwoton@gmail.com>
Cc: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Joey Gouly 2023-01-19 16:03:43 +00:00 committed by Andrew Morton
parent e6d2c436ff
commit b507808ebc
6 changed files with 93 additions and 1 deletions

View File

@ -156,4 +156,38 @@ calc_vm_flag_bits(unsigned long flags)
} }
unsigned long vm_commit_limit(void); unsigned long vm_commit_limit(void);
/*
* Denies creating a writable executable mapping or gaining executable permissions.
*
* This denies the following:
*
* a) mmap(PROT_WRITE | PROT_EXEC)
*
* b) mmap(PROT_WRITE)
* mprotect(PROT_EXEC)
*
* c) mmap(PROT_WRITE)
* mprotect(PROT_READ)
* mprotect(PROT_EXEC)
*
* But allows the following:
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
*/
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
{
if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
return false;
if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
return true;
if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
return true;
return false;
}
#endif /* _LINUX_MMAN_H */ #endif /* _LINUX_MMAN_H */

View File

@ -81,9 +81,13 @@ static inline int get_dumpable(struct mm_struct *mm)
* lifecycle of this mm, just for simplicity. * lifecycle of this mm, just for simplicity.
*/ */
#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */
#define MMF_HAS_MDWE 28
#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE)
#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK) MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
#endif /* _LINUX_SCHED_COREDUMP_H */ #endif /* _LINUX_SCHED_COREDUMP_H */

View File

@ -281,6 +281,12 @@ struct prctl_mm_map {
# define PR_SME_VL_LEN_MASK 0xffff # define PR_SME_VL_LEN_MASK 0xffff
# define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */ # define PR_SME_VL_INHERIT (1 << 17) /* inherit across exec */
/* Memory deny write / execute */
#define PR_SET_MDWE 65
# define PR_MDWE_REFUSE_EXEC_GAIN 1
#define PR_GET_MDWE 66
#define PR_SET_VMA 0x53564d41 #define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0 # define PR_SET_VMA_ANON_NAME 0

View File

@ -2348,6 +2348,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,
} }
#endif /* CONFIG_ANON_VMA_NAME */ #endif /* CONFIG_ANON_VMA_NAME */
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
if (arg3 || arg4 || arg5)
return -EINVAL;
if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
set_bit(MMF_HAS_MDWE, &current->mm->flags);
else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
return -EPERM; /* Cannot unset the flag */
return 0;
}
static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
PR_MDWE_REFUSE_EXEC_GAIN : 0;
}
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5) unsigned long, arg4, unsigned long, arg5)
{ {
@ -2623,6 +2650,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5); error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break; break;
#endif #endif
case PR_SET_MDWE:
error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
break;
case PR_GET_MDWE:
error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
break;
case PR_SET_VMA: case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5); error = prctl_set_vma(arg2, arg3, arg4, arg5);
break; break;

View File

@ -2669,6 +2669,16 @@ cannot_expand:
vma_set_anonymous(vma); vma_set_anonymous(vma);
} }
if (map_deny_write_exec(vma, vma->vm_flags)) {
error = -EACCES;
if (file)
goto close_and_free_vma;
else if (vma->vm_file)
goto unmap_and_free_vma;
else
goto free_vma;
}
/* Allow architectures to sanity-check the vm_flags */ /* Allow architectures to sanity-check the vm_flags */
if (!arch_validate_flags(vma->vm_flags)) { if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL; error = -EINVAL;

View File

@ -799,6 +799,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break; break;
} }
if (map_deny_write_exec(vma, newflags)) {
error = -EACCES;
goto out;
}
/* Allow architectures to sanity-check the new flags */ /* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) { if (!arch_validate_flags(newflags)) {
error = -EINVAL; error = -EINVAL;