linux/fs/erofs/data.c
Linus Torvalds c03098d4b9 gfs2: Fix mmap + page fault deadlocks
Functions gfs2_file_read_iter and gfs2_file_write_iter are both
 accessing the user buffer to write to or read from while holding the
 inode glock.  In the most basic scenario, that buffer will not be
 resident and it will be mapped to the same file.  Accessing the buffer
 will trigger a page fault, and gfs2 will deadlock trying to take the
 same inode glock again while trying to handle that fault.
 
 Fix that and similar, more complex scenarios by disabling page faults
 while accessing user buffers.  To make this work, introduce a small
 amount of new infrastructure and fix some bugs that didn't trigger so
 far, with page faults enabled.
 -----BEGIN PGP SIGNATURE-----
 
 iQJIBAABCAAyFiEEJZs3krPW0xkhLMTc1b+f6wMTZToFAmGBPisUHGFncnVlbmJh
 QHJlZGhhdC5jb20ACgkQ1b+f6wMTZTpE6A/7BezUnGuNJxJrR8pC+vcLYA7xAgUU
 6STQ6IN7w5UHRlSkNzZxZ2XPxW4uVQ4SxSEeaLqBsHZihepjcLNFZ/8MhQ6UPSD0
 8noHOi7CoIcp6IuWQtCpxRM/xjjm2SlMt2XbVJZaiJcdzCV9gB6TU9EkBRq7Zm/X
 9WFBbv1xZF0skn9ISCJvNtiiI+VyWKgMDUKxJUiTQjmJcklyyqHcVGmQi9BjqPz4
 4s3F+WH6CoGbDKlmNk/6Y9wZ/2+sbvGswVscUxPwJVPoZWsR1xBBUdAeAmEMD1P4
 BgE/Y1J8JXyVPYtyvZKq70XUhKdQkxB7RfX87YasOk9mY4Kjd5rIIGEykh+o2vC9
 kDhCHvf2Mnw5I6Rum3B7UXyB1vemY+fECIHsXhgBnS+ztabRtcAdpCuWoqb43ymw
 yEX1KwXyU4FpRYbrRvdZT42Fmh6ty8TW+N4swg8S2TrffirvgAi5yrcHZ4mPupYv
 lyzvsCW7Wv8hPXn/twNObX+okRgJnsxcCdBXARdCnRXfA8tH23xmu88u8RA1Vdxh
 nzTvv6Dx2EowwojuDWMx29Mw3fA2IqIfbOV+4FaRU7NZ2ZKtknL8yGl27qQUsMoJ
 vYsHTmagasjQr+NDJ3vQRLCw+JQ6B1hENpdkmixFD9moo7X1ZFW3HBi/UL973Bv6
 5CmgeXto8FRUFjI=
 =WeNd
 -----END PGP SIGNATURE-----

Merge tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2

Pull gfs2 mmap + page fault deadlocks fixes from Andreas Gruenbacher:
 "Functions gfs2_file_read_iter and gfs2_file_write_iter are both
  accessing the user buffer to write to or read from while holding the
  inode glock.

  In the most basic deadlock scenario, that buffer will not be resident
  and it will be mapped to the same file. Accessing the buffer will
  trigger a page fault, and gfs2 will deadlock trying to take the same
  inode glock again while trying to handle that fault.

  Fix that and similar, more complex scenarios by disabling page faults
  while accessing user buffers. To make this work, introduce a small
  amount of new infrastructure and fix some bugs that didn't trigger so
  far, with page faults enabled"

* tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2:
  gfs2: Fix mmap + page fault deadlocks for direct I/O
  iov_iter: Introduce nofault flag to disable page faults
  gup: Introduce FOLL_NOFAULT flag to disable page faults
  iomap: Add done_before argument to iomap_dio_rw
  iomap: Support partial direct I/O on user copy failures
  iomap: Fix iomap_dio_rw return value for user copies
  gfs2: Fix mmap + page fault deadlocks for buffered I/O
  gfs2: Eliminate ip->i_gh
  gfs2: Move the inode glock locking to gfs2_file_buffered_write
  gfs2: Introduce flag for glock holder auto-demotion
  gfs2: Clean up function may_grant
  gfs2: Add wrapper for iomap_file_buffered_write
  iov_iter: Introduce fault_in_iov_iter_writeable
  iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
  gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}
  powerpc/kvm: Fix kvm_use_magic_page
  iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
2021-11-02 12:25:03 -07:00

391 lines
9.7 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
#include <linux/prefetch.h>
#include <linux/dax.h>
#include <trace/events/erofs.h>
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
{
struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
struct page *page;
page = read_cache_page_gfp(mapping, blkaddr,
mapping_gfp_constraint(mapping, ~__GFP_FS));
/* should already be PageUptodate */
if (!IS_ERR(page))
lock_page(page);
return page;
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
{
int err = 0;
erofs_blk_t nblocks, lastblk;
u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
lastblk = nblocks - tailendpacking;
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
if (offset < blknr_to_addr(lastblk)) {
map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
map->m_plen = blknr_to_addr(lastblk) - offset;
} else if (tailendpacking) {
/* 2 - inode inline B: inode, [xattrs], inline last blk... */
struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
vi->xattr_isize + erofs_blkoff(map->m_la);
map->m_plen = inode->i_size - offset;
/* inline data should be located in one meta block */
if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
vi->nid);
DBG_BUGON(1);
err = -EFSCORRUPTED;
goto err_out;
}
map->m_flags |= EROFS_MAP_META;
} else {
erofs_err(inode->i_sb,
"internal error @ nid: %llu (size %llu), m_la 0x%llx",
vi->nid, inode->i_size, map->m_la);
DBG_BUGON(1);
err = -EIO;
goto err_out;
}
map->m_llen = map->m_plen;
err_out:
trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
return err;
}
static int erofs_map_blocks(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
struct page *page;
u64 chunknr;
unsigned int unit;
erofs_off_t pos;
int err = 0;
map->m_deviceid = 0;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
map->m_plen = 0;
goto out;
}
if (vi->datalayout != EROFS_INODE_CHUNK_BASED)
return erofs_map_blocks_flatmode(inode, map, flags);
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
unit = sizeof(*idx); /* chunk index */
else
unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
chunknr = map->m_la >> vi->chunkbits;
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos));
if (IS_ERR(page))
return PTR_ERR(page);
map->m_la = chunknr << vi->chunkbits;
map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
__le32 *blkaddr = page_address(page) + erofs_blkoff(pos);
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
} else {
map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
}
goto out_unlock;
}
/* parse chunk indexes */
idx = page_address(page) + erofs_blkoff(pos);
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
break;
default:
map->m_deviceid = le16_to_cpu(idx->device_id) &
EROFS_SB(sb)->device_id_mask;
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
}
out_unlock:
unlock_page(page);
put_page(page);
out:
map->m_llen = map->m_plen;
return err;
}
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
int id;
/* primary device by default */
map->m_bdev = sb->s_bdev;
map->m_daxdev = EROFS_SB(sb)->dax_dev;
if (map->m_deviceid) {
down_read(&devs->rwsem);
dif = idr_find(&devs->tree, map->m_deviceid - 1);
if (!dif) {
up_read(&devs->rwsem);
return -ENODEV;
}
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
up_read(&devs->rwsem);
} else if (devs->extra_devices) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
erofs_off_t startoff, length;
if (!dif->mapped_blkaddr)
continue;
startoff = blknr_to_addr(dif->mapped_blkaddr);
length = blknr_to_addr(dif->blocks);
if (map->m_pa >= startoff &&
map->m_pa < startoff + length) {
map->m_pa -= startoff;
map->m_bdev = dif->bdev;
map->m_daxdev = dif->dax_dev;
break;
}
}
up_read(&devs->rwsem);
}
return 0;
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
int ret;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
map.m_la = offset;
map.m_llen = length;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret < 0)
return ret;
mdev = (struct erofs_map_dev) {
.m_deviceid = map.m_deviceid,
.m_pa = map.m_pa,
};
ret = erofs_map_dev(inode->i_sb, &mdev);
if (ret)
return ret;
iomap->bdev = mdev.m_bdev;
iomap->dax_dev = mdev.m_daxdev;
iomap->offset = map.m_la;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
if (!iomap->length)
iomap->length = length;
return 0;
}
if (map.m_flags & EROFS_MAP_META) {
struct page *ipage;
iomap->type = IOMAP_INLINE;
ipage = erofs_get_meta_page(inode->i_sb,
erofs_blknr(mdev.m_pa));
if (IS_ERR(ipage))
return PTR_ERR(ipage);
iomap->inline_data = page_address(ipage) +
erofs_blkoff(mdev.m_pa);
iomap->private = ipage;
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = mdev.m_pa;
}
return 0;
}
static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
ssize_t written, unsigned int flags, struct iomap *iomap)
{
struct page *ipage = iomap->private;
if (ipage) {
DBG_BUGON(iomap->type != IOMAP_INLINE);
unlock_page(ipage);
put_page(ipage);
} else {
DBG_BUGON(iomap->type == IOMAP_INLINE);
}
return written;
}
static const struct iomap_ops erofs_iomap_ops = {
.iomap_begin = erofs_iomap_begin,
.iomap_end = erofs_iomap_end,
};
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
return iomap_fiemap(inode, fieinfo, start, len,
&z_erofs_iomap_report_ops);
#else
return -EOPNOTSUPP;
#endif
}
return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
}
/*
* since we dont have write or truncate flows, so no inode
* locking needs to be held at the moment.
*/
static int erofs_readpage(struct file *file, struct page *page)
{
return iomap_readpage(page, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
{
return iomap_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
{
return iomap_bmap(mapping, block, &erofs_iomap_ops);
}
static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
loff_t align = iocb->ki_pos | iov_iter_count(to) |
iov_iter_alignment(to);
struct block_device *bdev = inode->i_sb->s_bdev;
unsigned int blksize_mask;
if (bdev)
blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
else
blksize_mask = (1 << inode->i_blkbits) - 1;
if (align & blksize_mask)
return -EINVAL;
return 0;
}
static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
/* no need taking (shared) inode lock since it's a ro filesystem */
if (!iov_iter_count(to))
return 0;
#ifdef CONFIG_FS_DAX
if (IS_DAX(iocb->ki_filp->f_mapping->host))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
if (iocb->ki_flags & IOCB_DIRECT) {
int err = erofs_prepare_dio(iocb, to);
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, 0);
if (err < 0)
return err;
}
return filemap_read(iocb, to, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
.readpage = erofs_readpage,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
};
#ifdef CONFIG_FS_DAX
static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
}
static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
}
static const struct vm_operations_struct erofs_dax_vm_ops = {
.fault = erofs_dax_fault,
.huge_fault = erofs_dax_huge_fault,
};
static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
if (!IS_DAX(file_inode(file)))
return generic_file_readonly_mmap(file, vma);
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
return -EINVAL;
vma->vm_ops = &erofs_dax_vm_ops;
vma->vm_flags |= VM_HUGEPAGE;
return 0;
}
#else
#define erofs_file_mmap generic_file_readonly_mmap
#endif
const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.splice_read = generic_file_splice_read,
};