mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2024-11-27 06:04:23 +08:00
532b53cebe
Return -ENOSYS from memfd_secret() syscall if !can_set_direct_map(). This
is the case for example on some arm64 configurations, where marking 4k
PTEs in the direct map not present can only be done if the direct map is
set up at 4k granularity in the first place (as ARM's break-before-make
semantics do not easily allow breaking apart large/gigantic pages).
More precisely, on arm64 systems with !can_set_direct_map(),
set_direct_map_invalid_noflush() is a no-op, however it returns success
(0) instead of an error. This means that memfd_secret will seemingly
"work" (e.g. syscall succeeds, you can mmap the fd and fault in pages),
but it does not actually achieve its goal of removing its memory from the
direct map.
Note that with this patch, memfd_secret() will start erroring on systems
where can_set_direct_map() returns false (arm64 with
CONFIG_RODATA_FULL_DEFAULT_ENABLED=n, CONFIG_DEBUG_PAGEALLOC=n and
CONFIG_KFENCE=n), but that still seems better than the current silent
failure. Since CONFIG_RODATA_FULL_DEFAULT_ENABLED defaults to 'y', most
arm64 systems actually have a working memfd_secret() and aren't be
affected.
From going through the iterations of the original memfd_secret patch
series, it seems that disabling the syscall in these scenarios was the
intended behavior [1] (preferred over having
set_direct_map_invalid_noflush return an error as that would result in
SIGBUSes at page-fault time), however the check for it got dropped between
v16 [2] and v17 [3], when secretmem moved away from CMA allocations.
[1]: https://lore.kernel.org/lkml/20201124164930.GK8537@kernel.org/
[2]: https://lore.kernel.org/lkml/20210121122723.3446-11-rppt@kernel.org/#t
[3]: https://lore.kernel.org/lkml/20201125092208.12544-10-rppt@kernel.org/
Link: https://lkml.kernel.org/r/20241001080056.784735-1-roypat@amazon.co.uk
Fixes: 1507f51255
("mm: introduce memfd_secret system call to create "secret" memory areas")
Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Gowans <jgowans@amazon.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
296 lines
6.5 KiB
C
296 lines
6.5 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright IBM Corporation, 2021
|
|
*
|
|
* Author: Mike Rapoport <rppt@linux.ibm.com>
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/mount.h>
|
|
#include <linux/memfd.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/printk.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/pseudo_fs.h>
|
|
#include <linux/secretmem.h>
|
|
#include <linux/set_memory.h>
|
|
#include <linux/sched/signal.h>
|
|
|
|
#include <uapi/linux/magic.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include "internal.h"
|
|
|
|
#undef pr_fmt
|
|
#define pr_fmt(fmt) "secretmem: " fmt
|
|
|
|
/*
|
|
* Define mode and flag masks to allow validation of the system call
|
|
* parameters.
|
|
*/
|
|
#define SECRETMEM_MODE_MASK (0x0)
|
|
#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
|
|
|
|
static bool secretmem_enable __ro_after_init = 1;
|
|
module_param_named(enable, secretmem_enable, bool, 0400);
|
|
MODULE_PARM_DESC(secretmem_enable,
|
|
"Enable secretmem and memfd_secret(2) system call");
|
|
|
|
static atomic_t secretmem_users;
|
|
|
|
bool secretmem_active(void)
|
|
{
|
|
return !!atomic_read(&secretmem_users);
|
|
}
|
|
|
|
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
|
|
{
|
|
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
|
struct inode *inode = file_inode(vmf->vma->vm_file);
|
|
pgoff_t offset = vmf->pgoff;
|
|
gfp_t gfp = vmf->gfp_mask;
|
|
unsigned long addr;
|
|
struct page *page;
|
|
struct folio *folio;
|
|
vm_fault_t ret;
|
|
int err;
|
|
|
|
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
|
|
return vmf_error(-EINVAL);
|
|
|
|
filemap_invalidate_lock_shared(mapping);
|
|
|
|
retry:
|
|
page = find_lock_page(mapping, offset);
|
|
if (!page) {
|
|
folio = folio_alloc(gfp | __GFP_ZERO, 0);
|
|
if (!folio) {
|
|
ret = VM_FAULT_OOM;
|
|
goto out;
|
|
}
|
|
|
|
page = &folio->page;
|
|
err = set_direct_map_invalid_noflush(page);
|
|
if (err) {
|
|
folio_put(folio);
|
|
ret = vmf_error(err);
|
|
goto out;
|
|
}
|
|
|
|
__folio_mark_uptodate(folio);
|
|
err = filemap_add_folio(mapping, folio, offset, gfp);
|
|
if (unlikely(err)) {
|
|
folio_put(folio);
|
|
/*
|
|
* If a split of large page was required, it
|
|
* already happened when we marked the page invalid
|
|
* which guarantees that this call won't fail
|
|
*/
|
|
set_direct_map_default_noflush(page);
|
|
if (err == -EEXIST)
|
|
goto retry;
|
|
|
|
ret = vmf_error(err);
|
|
goto out;
|
|
}
|
|
|
|
addr = (unsigned long)page_address(page);
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
}
|
|
|
|
vmf->page = page;
|
|
ret = VM_FAULT_LOCKED;
|
|
|
|
out:
|
|
filemap_invalidate_unlock_shared(mapping);
|
|
return ret;
|
|
}
|
|
|
|
static const struct vm_operations_struct secretmem_vm_ops = {
|
|
.fault = secretmem_fault,
|
|
};
|
|
|
|
static int secretmem_release(struct inode *inode, struct file *file)
|
|
{
|
|
atomic_dec(&secretmem_users);
|
|
return 0;
|
|
}
|
|
|
|
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
unsigned long len = vma->vm_end - vma->vm_start;
|
|
|
|
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
|
|
return -EINVAL;
|
|
|
|
if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
|
|
return -EAGAIN;
|
|
|
|
vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
|
|
vma->vm_ops = &secretmem_vm_ops;
|
|
|
|
return 0;
|
|
}
|
|
|
|
bool vma_is_secretmem(struct vm_area_struct *vma)
|
|
{
|
|
return vma->vm_ops == &secretmem_vm_ops;
|
|
}
|
|
|
|
static const struct file_operations secretmem_fops = {
|
|
.release = secretmem_release,
|
|
.mmap = secretmem_mmap,
|
|
};
|
|
|
|
static int secretmem_migrate_folio(struct address_space *mapping,
|
|
struct folio *dst, struct folio *src, enum migrate_mode mode)
|
|
{
|
|
return -EBUSY;
|
|
}
|
|
|
|
static void secretmem_free_folio(struct folio *folio)
|
|
{
|
|
set_direct_map_default_noflush(&folio->page);
|
|
folio_zero_segment(folio, 0, folio_size(folio));
|
|
}
|
|
|
|
const struct address_space_operations secretmem_aops = {
|
|
.dirty_folio = noop_dirty_folio,
|
|
.free_folio = secretmem_free_folio,
|
|
.migrate_folio = secretmem_migrate_folio,
|
|
};
|
|
|
|
static int secretmem_setattr(struct mnt_idmap *idmap,
|
|
struct dentry *dentry, struct iattr *iattr)
|
|
{
|
|
struct inode *inode = d_inode(dentry);
|
|
struct address_space *mapping = inode->i_mapping;
|
|
unsigned int ia_valid = iattr->ia_valid;
|
|
int ret;
|
|
|
|
filemap_invalidate_lock(mapping);
|
|
|
|
if ((ia_valid & ATTR_SIZE) && inode->i_size)
|
|
ret = -EINVAL;
|
|
else
|
|
ret = simple_setattr(idmap, dentry, iattr);
|
|
|
|
filemap_invalidate_unlock(mapping);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static const struct inode_operations secretmem_iops = {
|
|
.setattr = secretmem_setattr,
|
|
};
|
|
|
|
static struct vfsmount *secretmem_mnt;
|
|
|
|
static struct file *secretmem_file_create(unsigned long flags)
|
|
{
|
|
struct file *file;
|
|
struct inode *inode;
|
|
const char *anon_name = "[secretmem]";
|
|
const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
|
|
int err;
|
|
|
|
inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
|
|
if (IS_ERR(inode))
|
|
return ERR_CAST(inode);
|
|
|
|
err = security_inode_init_security_anon(inode, &qname, NULL);
|
|
if (err) {
|
|
file = ERR_PTR(err);
|
|
goto err_free_inode;
|
|
}
|
|
|
|
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
|
|
O_RDWR, &secretmem_fops);
|
|
if (IS_ERR(file))
|
|
goto err_free_inode;
|
|
|
|
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
|
|
mapping_set_unevictable(inode->i_mapping);
|
|
|
|
inode->i_op = &secretmem_iops;
|
|
inode->i_mapping->a_ops = &secretmem_aops;
|
|
|
|
/* pretend we are a normal file with zero size */
|
|
inode->i_mode |= S_IFREG;
|
|
inode->i_size = 0;
|
|
|
|
return file;
|
|
|
|
err_free_inode:
|
|
iput(inode);
|
|
return file;
|
|
}
|
|
|
|
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
|
|
{
|
|
struct file *file;
|
|
int fd, err;
|
|
|
|
/* make sure local flags do not confict with global fcntl.h */
|
|
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
|
|
|
|
if (!secretmem_enable || !can_set_direct_map())
|
|
return -ENOSYS;
|
|
|
|
if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
|
|
return -EINVAL;
|
|
if (atomic_read(&secretmem_users) < 0)
|
|
return -ENFILE;
|
|
|
|
fd = get_unused_fd_flags(flags & O_CLOEXEC);
|
|
if (fd < 0)
|
|
return fd;
|
|
|
|
file = secretmem_file_create(flags);
|
|
if (IS_ERR(file)) {
|
|
err = PTR_ERR(file);
|
|
goto err_put_fd;
|
|
}
|
|
|
|
file->f_flags |= O_LARGEFILE;
|
|
|
|
atomic_inc(&secretmem_users);
|
|
fd_install(fd, file);
|
|
return fd;
|
|
|
|
err_put_fd:
|
|
put_unused_fd(fd);
|
|
return err;
|
|
}
|
|
|
|
static int secretmem_init_fs_context(struct fs_context *fc)
|
|
{
|
|
return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
|
|
}
|
|
|
|
static struct file_system_type secretmem_fs = {
|
|
.name = "secretmem",
|
|
.init_fs_context = secretmem_init_fs_context,
|
|
.kill_sb = kill_anon_super,
|
|
};
|
|
|
|
static int __init secretmem_init(void)
|
|
{
|
|
if (!secretmem_enable || !can_set_direct_map())
|
|
return 0;
|
|
|
|
secretmem_mnt = kern_mount(&secretmem_fs);
|
|
if (IS_ERR(secretmem_mnt))
|
|
return PTR_ERR(secretmem_mnt);
|
|
|
|
/* prevent secretmem mappings from ever getting PROT_EXEC */
|
|
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
|
|
|
|
return 0;
|
|
}
|
|
fs_initcall(secretmem_init);
|