linux/mm/secretmem.c
Axel Rasmussen f9b141f936 mm/secretmem: fix panic when growing a memfd_secret
When one tries to grow an existing memfd_secret with ftruncate, one gets
a panic [1].  For example, doing the following reliably induces the
panic:

    fd = memfd_secret();

    ftruncate(fd, 10);
    ptr = mmap(NULL, 10, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    strcpy(ptr, "123456789");

    munmap(ptr, 10);
    ftruncate(fd, 20);

The basic reason for this is, when we grow with ftruncate, we call down
into simple_setattr, and then truncate_inode_pages_range, and eventually
we try to zero part of the memory.  The normal truncation code does this
via the direct map (i.e., it calls page_address() and hands that to
memset()).

For memfd_secret though, we specifically don't map our pages via the
direct map (i.e.  we call set_direct_map_invalid_noflush() on every
fault).  So the address returned by page_address() isn't useful, and
when we try to memset() with it we panic.

This patch avoids the panic by implementing a custom setattr for
memfd_secret, which detects resizes specifically (setting the size for
the first time works just fine, since there are no existing pages to try
to zero), and rejects them with EINVAL.

One could argue growing should be supported, but I think that will
require a significantly more lengthy change.  So, I propose a minimal
fix for the benefit of stable kernels, and then perhaps to extend
memfd_secret to support growing in a separate patch.

[1]:

  BUG: unable to handle page fault for address: ffffa0a889277028
  #PF: supervisor write access in kernel mode
  #PF: error_code(0x0002) - not-present page
  PGD afa01067 P4D afa01067 PUD 83f909067 PMD 83f8bf067 PTE 800ffffef6d88060
  Oops: 0002 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
  CPU: 0 PID: 281 Comm: repro Not tainted 5.17.0-dbg-DEV #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
  RIP: 0010:memset_erms+0x9/0x10
  Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 <f3> aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01
  RSP: 0018:ffffb932c09afbf0 EFLAGS: 00010246
  RAX: 0000000000000000 RBX: ffffda63c4249dc0 RCX: 0000000000000fd8
  RDX: 0000000000000fd8 RSI: 0000000000000000 RDI: ffffa0a889277028
  RBP: ffffb932c09afc00 R08: 0000000000001000 R09: ffffa0a889277028
  R10: 0000000000020023 R11: 0000000000000000 R12: ffffda63c4249dc0
  R13: ffffa0a890d70d98 R14: 0000000000000028 R15: 0000000000000fd8
  FS:  00007f7294899580(0000) GS:ffffa0af9bc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffffa0a889277028 CR3: 0000000107ef6006 CR4: 0000000000370ef0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   ? zero_user_segments+0x82/0x190
   truncate_inode_partial_folio+0xd4/0x2a0
   truncate_inode_pages_range+0x380/0x830
   truncate_setsize+0x63/0x80
   simple_setattr+0x37/0x60
   notify_change+0x3d8/0x4d0
   do_sys_ftruncate+0x162/0x1d0
   __x64_sys_ftruncate+0x1c/0x20
   do_syscall_64+0x44/0xa0
   entry_SYSCALL_64_after_hwframe+0x44/0xae
  Modules linked in: xhci_pci xhci_hcd virtio_net net_failover failover virtio_blk virtio_balloon uhci_hcd ohci_pci ohci_hcd evdev ehci_pci ehci_hcd 9pnet_virtio 9p netfs 9pnet
  CR2: ffffa0a889277028

[lkp@intel.com: secretmem_iops can be static]
  Signed-off-by: kernel test robot <lkp@intel.com>
[axelrasmussen@google.com: return EINVAL]

Link: https://lkml.kernel.org/r/20220324210909.1843814-1-axelrasmussen@google.com
Link: https://lkml.kernel.org/r/20220412193023.279320-1-axelrasmussen@google.com
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-04-15 14:49:54 -07:00

275 lines
6.0 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corporation, 2021
*
* Author: Mike Rapoport <rppt@linux.ibm.com>
*/
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/swap.h>
#include <linux/mount.h>
#include <linux/memfd.h>
#include <linux/bitops.h>
#include <linux/printk.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/pseudo_fs.h>
#include <linux/secretmem.h>
#include <linux/set_memory.h>
#include <linux/sched/signal.h>
#include <uapi/linux/magic.h>
#include <asm/tlbflush.h>
#include "internal.h"
/*
 * Redefine pr_fmt so that format strings passed through it are prefixed
 * with "secretmem: ".
 */
#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt
/*
 * Define mode and flag masks to allow validation of the system call
 * parameters.
 *
 * No mode bits are currently defined (the mode mask is 0x0) and the flags
 * mask is simply an alias for the mode mask, so it is empty as well.
 */
#define SECRETMEM_MODE_MASK (0x0)
#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
/*
 * Global on/off switch for secretmem, exposed as the "enable" parameter
 * with mode 0400.  It is zero-initialised, i.e. secretmem is off unless
 * the parameter is set; while it is false memfd_secret() returns -ENOSYS
 * and secretmem_init() does not mount the internal filesystem.
 */
static bool secretmem_enable __ro_after_init;
module_param_named(enable, secretmem_enable, bool, 0400);
/*
 * The description must be keyed by the externally visible parameter name
 * ("enable"), not by the name of the backing variable.
 */
MODULE_PARM_DESC(enable,
		 "Enable secretmem and memfd_secret(2) system call");
/*
 * Count of secretmem files currently open: incremented by memfd_secret()
 * when a file is installed and decremented by secretmem_release().
 */
static atomic_t secretmem_users;

/*
 * secretmem_active - tell whether the secretmem user counter is non-zero.
 *
 * Return: true if secretmem_users currently reads as anything but zero,
 * false otherwise.
 */
bool secretmem_active(void)
{
	return atomic_read(&secretmem_users) != 0;
}
/*
 * secretmem_fault - fault handler for secretmem mappings.
 * @vmf: fault descriptor; vmf->pgoff selects the page within the file.
 *
 * Looks the page up in the file's page cache.  If it is not there yet, a
 * zeroed page is allocated, removed from the kernel direct map, marked
 * up to date and inserted into the page cache; the kernel TLB range that
 * covered the page's direct-map address is then flushed.
 *
 * Return: VM_FAULT_LOCKED with vmf->page set on success, VM_FAULT_OOM if
 * the page allocation fails, or the vmf_error() translation of -EINVAL
 * (fault at or beyond i_size) or of the error reported by the direct-map
 * or page-cache helpers.
 */
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	pgoff_t offset = vmf->pgoff;
	gfp_t gfp = vmf->gfp_mask;
	unsigned long addr;
	struct page *page;
	int err;

	/* Faults at or past the end of the file are invalid. */
	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
		return vmf_error(-EINVAL);

retry:
	page = find_lock_page(mapping, offset);
	if (!page) {
		page = alloc_page(gfp | __GFP_ZERO);
		if (!page)
			return VM_FAULT_OOM;

		err = set_direct_map_invalid_noflush(page);
		if (err) {
			put_page(page);
			return vmf_error(err);
		}

		__SetPageUptodate(page);
		err = add_to_page_cache_lru(page, mapping, offset, gfp);
		if (unlikely(err)) {
			/*
			 * Put the page back into the direct map BEFORE
			 * dropping our reference.  Once put_page() releases
			 * the last reference the page may be handed out to
			 * another user, which must never see it while it is
			 * still missing from the direct map, and we must not
			 * touch the direct-map state of a page we no longer
			 * own.
			 *
			 * If a split of large page was required, it
			 * already happened when we marked the page invalid
			 * which guarantees that this call won't fail
			 */
			set_direct_map_default_noflush(page);
			put_page(page);
			/* Somebody else inserted a page first: use theirs. */
			if (err == -EEXIST)
				goto retry;
			return vmf_error(err);
		}

		/* Drop any stale kernel TLB entries for the direct-map alias. */
		addr = (unsigned long)page_address(page);
		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}
/*
 * VMA operations installed on every secretmem mapping by secretmem_mmap().
 * vma_is_secretmem() recognises secretmem VMAs by comparing vm_ops against
 * the address of this table.
 */
static const struct vm_operations_struct secretmem_vm_ops = {
.fault = secretmem_fault,
};
/*
 * secretmem_release - ->release() hook for secretmem files.
 * @inode: inode of the file being released (unused).
 * @file:  file being released (unused).
 *
 * Gives back the secretmem_users count that memfd_secret() took when the
 * file was installed.
 *
 * Return: always 0.
 */
static int secretmem_release(struct inode *inode, struct file *file)
{
	atomic_dec(&secretmem_users);

	return 0;
}
/*
 * secretmem_mmap - ->mmap() hook for secretmem files.
 * @file: file being mapped (unused).
 * @vma:  the new mapping.
 *
 * Accepts only mappings that carry VM_SHARED or VM_MAYSHARE, checks the
 * mapping length against mlock_future_check() as if it were VM_LOCKED,
 * then marks the VMA VM_LOCKED | VM_DONTDUMP and installs the secretmem
 * VMA operations.
 *
 * Return: 0 on success, -EINVAL for a mapping with neither sharing flag,
 * -EAGAIN if mlock_future_check() rejects the mapping.
 */
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long span = vma->vm_end - vma->vm_start;

	if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
		return -EINVAL;

	if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, span))
		return -EAGAIN;

	vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
	vma->vm_ops = &secretmem_vm_ops;

	return 0;
}
/*
 * vma_is_secretmem - test whether a VMA was set up by secretmem_mmap().
 * @vma: mapping to inspect.
 *
 * Return: true if the VMA's operations table is secretmem_vm_ops.
 */
bool vma_is_secretmem(struct vm_area_struct *vma)
{
	const struct vm_operations_struct *ops = vma->vm_ops;

	return ops == &secretmem_vm_ops;
}
/*
 * File operations for the files created by secretmem_file_create().
 * Only release and mmap are provided here.
 */
static const struct file_operations secretmem_fops = {
.release = secretmem_release,
.mmap = secretmem_mmap,
};
/*
 * secretmem_isolate_page - ->isolate_page() hook for secretmem mappings.
 * @page: page proposed for isolation (unused).
 * @mode: isolation mode (unused).
 *
 * Return: always false, i.e. isolation is refused for every page.
 */
static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
{
	return false;
}
/*
 * secretmem_migratepage - ->migratepage() hook for secretmem mappings.
 * @mapping: address space owning the page (unused).
 * @newpage: migration target (unused).
 * @page:    migration source (unused).
 * @mode:    migration mode (unused).
 *
 * Return: always -EBUSY, i.e. migration is refused for every page.
 */
static int secretmem_migratepage(struct address_space *mapping,
				 struct page *newpage, struct page *page,
				 enum migrate_mode mode)
{
	return -EBUSY;
}
/*
 * secretmem_freepage - ->freepage() hook for secretmem mappings.
 * @page: page leaving the secretmem page cache.
 *
 * Puts the page back into the kernel direct map first and only then
 * clears its contents with clear_highpage().
 */
static void secretmem_freepage(struct page *page)
{
	/* Must come first: the page has to be mapped before it is cleared. */
	set_direct_map_default_noflush(page);

	clear_highpage(page);
}
/*
 * Address-space operations installed on the mapping of every secretmem
 * inode by secretmem_file_create().
 *
 * NOTE(review): this table is deliberately not static - presumably it is
 * referenced from outside this file; confirm before narrowing its linkage.
 */
const struct address_space_operations secretmem_aops = {
.dirty_folio = noop_dirty_folio,
/* restores the direct map entry and clears the page */
.freepage = secretmem_freepage,
/* always fails with -EBUSY */
.migratepage = secretmem_migratepage,
/* always returns false */
.isolate_page = secretmem_isolate_page,
};
/*
 * secretmem_setattr - ->setattr() hook for secretmem inodes.
 * @mnt_userns: user namespace of the mount, passed through unchanged.
 * @dentry:     dentry whose inode is being modified.
 * @iattr:      requested attribute changes.
 *
 * A size change is only permitted while the inode's size is still zero;
 * every other request is handed to simple_setattr().
 *
 * Return: -EINVAL when ATTR_SIZE is requested on an inode whose size is
 * already non-zero, otherwise whatever simple_setattr() returns.
 */
static int secretmem_setattr(struct user_namespace *mnt_userns,
			     struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);

	/* Not a resize, or the very first sizing of an empty file. */
	if (!(iattr->ia_valid & ATTR_SIZE) || !inode->i_size)
		return simple_setattr(mnt_userns, dentry, iattr);

	return -EINVAL;
}
/*
 * Inode operations installed on secretmem inodes by
 * secretmem_file_create(); only setattr is provided, so that resizing an
 * already-sized file can be rejected.
 */
static const struct inode_operations secretmem_iops = {
.setattr = secretmem_setattr,
};
static struct vfsmount *secretmem_mnt;
static struct file *secretmem_file_create(unsigned long flags)
{
struct file *file = ERR_PTR(-ENOMEM);
struct inode *inode;
inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
O_RDWR, &secretmem_fops);
if (IS_ERR(file))
goto err_free_inode;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_op = &secretmem_iops;
inode->i_mapping->a_ops = &secretmem_aops;
/* pretend we are a normal file with zero size */
inode->i_mode |= S_IFREG;
inode->i_size = 0;
return file;
err_free_inode:
iput(inode);
return file;
}
/*
 * memfd_secret - create a secretmem file and return a descriptor for it.
 * @flags: O_CLOEXEC and/or bits from SECRETMEM_FLAGS_MASK.
 *
 * Return: the new file descriptor on success; -ENOSYS if secretmem is not
 * enabled, -EINVAL for unknown flag bits, -ENFILE if the secretmem_users
 * counter reads negative, or the error from get_unused_fd_flags() or
 * secretmem_file_create().
 */
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
	struct file *file;
	int fd, err;

	/* make sure local flags do not conflict with global fcntl.h */
	BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);

	if (!secretmem_enable)
		return -ENOSYS;

	if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
		return -EINVAL;

	if (atomic_read(&secretmem_users) < 0)
		return -ENFILE;

	fd = get_unused_fd_flags(flags & O_CLOEXEC);
	if (fd < 0)
		return fd;

	file = secretmem_file_create(flags);
	if (IS_ERR(file)) {
		err = PTR_ERR(file);
		/* Nothing was installed; hand the reserved fd back. */
		put_unused_fd(fd);
		return err;
	}

	file->f_flags |= O_LARGEFILE;

	/* Count the user before the fd becomes visible to userspace. */
	atomic_inc(&secretmem_users);
	fd_install(fd, file);

	return fd;
}
/*
 * secretmem_init_fs_context - ->init_fs_context() hook of secretmem_fs.
 * @fc: filesystem context being initialised.
 *
 * Sets the context up as a pseudo filesystem with SECRETMEM_MAGIC.
 *
 * Return: 0 on success, -ENOMEM if init_pseudo() returns NULL.
 */
static int secretmem_init_fs_context(struct fs_context *fc)
{
	if (!init_pseudo(fc, SECRETMEM_MAGIC))
		return -ENOMEM;

	return 0;
}
/*
 * Filesystem type behind secretmem inodes.  It is mounted internally by
 * secretmem_init() through kern_mount().
 */
static struct file_system_type secretmem_fs = {
.name = "secretmem",
.init_fs_context = secretmem_init_fs_context,
.kill_sb = kill_anon_super,
};
/*
 * secretmem_init - mount the internal secretmem filesystem.
 *
 * Does nothing when secretmem is not enabled.  Otherwise mounts
 * secretmem_fs and flags the mount MNT_NOEXEC.
 *
 * Return: 0 on success or when secretmem is disabled, otherwise the error
 * encoded in the pointer returned by kern_mount().
 */
static int secretmem_init(void)
{
	if (!secretmem_enable)
		return 0;

	secretmem_mnt = kern_mount(&secretmem_fs);
	/*
	 * Bail out before touching the mount: on failure secretmem_mnt holds
	 * an ERR_PTR() value and must not be dereferenced.
	 */
	if (IS_ERR(secretmem_mnt))
		return PTR_ERR(secretmem_mnt);

	/* prevent secretmem mappings from ever getting PROT_EXEC */
	secretmem_mnt->mnt_flags |= MNT_NOEXEC;

	return 0;
}
fs_initcall(secretmem_init);