2019-07-23 00:26:24 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
|
|
* fs/ext4/verity.c: fs-verity support for ext4
|
|
|
|
*
|
|
|
|
* Copyright 2019 Google LLC
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Implementation of fsverity_operations for ext4.
 *
 * ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past
 * the end of the file, starting at the first 64K boundary beyond i_size.  This
 * approach works because (a) verity files are readonly, and (b) pages fully
 * beyond i_size aren't visible to userspace but can be read/written internally
 * by ext4 with only some relatively small changes to ext4.  This approach
 * avoids having to depend on the EA_INODE feature and on re-architecting
 * ext4's xattr support to support paging multi-gigabyte xattrs into memory, and
 * to support encrypting xattrs.  Note that the verity metadata *must* be
 * encrypted when the file is, since it contains hashes of the plaintext data.
 *
 * Using a 64K boundary rather than a 4K one keeps things ready for
 * architectures with 64K pages, and it doesn't necessarily waste space on-disk
 * since there can be a hole between i_size and the start of the Merkle tree.
 */
|
|
|
|
|
|
|
|
#include <linux/quotaops.h>
|
|
|
|
|
|
|
|
#include "ext4.h"
|
|
|
|
#include "ext4_extents.h"
|
|
|
|
#include "ext4_jbd2.h"
|
|
|
|
|
|
|
|
static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
|
|
|
|
{
|
|
|
|
return round_up(inode->i_size, 65536);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read some verity metadata from the inode. __vfs_read() can't be used because
|
|
|
|
* we need to read beyond i_size.
|
|
|
|
*/
|
|
|
|
static int pagecache_read(struct inode *inode, void *buf, size_t count,
|
|
|
|
loff_t pos)
|
|
|
|
{
|
|
|
|
while (count) {
|
2023-03-25 02:01:28 +08:00
|
|
|
struct folio *folio;
|
|
|
|
size_t n;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2023-03-25 02:01:28 +08:00
|
|
|
folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT,
|
2019-07-23 00:26:24 +08:00
|
|
|
NULL);
|
2023-03-25 02:01:28 +08:00
|
|
|
if (IS_ERR(folio))
|
|
|
|
return PTR_ERR(folio);
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2023-03-25 02:01:28 +08:00
|
|
|
n = memcpy_from_file_folio(buf, folio, pos, count);
|
|
|
|
folio_put(folio);
|
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
buf += n;
|
|
|
|
pos += n;
|
|
|
|
count -= n;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
|
|
|
|
* kernel_write() can't be used because the file descriptor is readonly.
|
|
|
|
*/
|
|
|
|
static int pagecache_write(struct inode *inode, const void *buf, size_t count,
|
|
|
|
loff_t pos)
|
|
|
|
{
|
2022-03-04 02:43:29 +08:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
const struct address_space_operations *aops = mapping->a_ops;
|
|
|
|
|
2019-07-23 00:26:24 +08:00
|
|
|
if (pos + count > inode->i_sb->s_maxbytes)
|
|
|
|
return -EFBIG;
|
|
|
|
|
|
|
|
while (count) {
|
|
|
|
size_t n = min_t(size_t, count,
|
|
|
|
PAGE_SIZE - offset_in_page(pos));
|
|
|
|
struct page *page;
|
2022-11-21 19:21:30 +08:00
|
|
|
void *fsdata = NULL;
|
2019-07-23 00:26:24 +08:00
|
|
|
int res;
|
|
|
|
|
2022-03-04 02:43:29 +08:00
|
|
|
res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata);
|
2019-07-23 00:26:24 +08:00
|
|
|
if (res)
|
|
|
|
return res;
|
|
|
|
|
2021-02-08 03:04:23 +08:00
|
|
|
memcpy_to_page(page, offset_in_page(pos), buf, n);
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2022-03-04 02:43:29 +08:00
|
|
|
res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata);
|
2019-07-23 00:26:24 +08:00
|
|
|
if (res < 0)
|
|
|
|
return res;
|
|
|
|
if (res != n)
|
|
|
|
return -EIO;
|
|
|
|
|
|
|
|
buf += n;
|
|
|
|
pos += n;
|
|
|
|
count -= n;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ext4_begin_enable_verity(struct file *filp)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(filp);
|
|
|
|
const int credits = 2; /* superblock and inode for ext4_orphan_add() */
|
|
|
|
handle_t *handle;
|
|
|
|
int err;
|
|
|
|
|
2020-05-28 23:00:02 +08:00
|
|
|
if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX))
|
2020-05-28 22:59:56 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2019-07-23 00:26:24 +08:00
|
|
|
if (ext4_verity_in_progress(inode))
|
|
|
|
return -EBUSY;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since the file was opened readonly, we have to initialize the jbd
|
|
|
|
* inode and quotas here and not rely on ->open() doing it. This must
|
|
|
|
* be done before evicting the inline data.
|
|
|
|
*/
|
|
|
|
|
|
|
|
err = ext4_inode_attach_jinode(inode);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = dquot_initialize(inode);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = ext4_convert_inline_data(inode);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
|
|
|
|
ext4_warning_inode(inode,
|
|
|
|
"verity is only allowed on extent-based files");
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ext4 uses the last allocated block to find the verity descriptor, so
|
|
|
|
* we must remove any other blocks past EOF which might confuse things.
|
|
|
|
*/
|
|
|
|
err = ext4_truncate(inode);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
|
|
|
|
if (IS_ERR(handle))
|
|
|
|
return PTR_ERR(handle);
|
|
|
|
|
|
|
|
err = ext4_orphan_add(handle, inode);
|
|
|
|
if (err == 0)
|
|
|
|
ext4_set_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
|
|
|
|
|
|
|
|
ext4_journal_stop(handle);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ext4 stores the verity descriptor beginning on the next filesystem block
|
|
|
|
* boundary after the Merkle tree. Then, the descriptor size is stored in the
|
|
|
|
* last 4 bytes of the last allocated filesystem block --- which is either the
|
|
|
|
* block in which the descriptor ends, or the next block after that if there
|
|
|
|
* weren't at least 4 bytes remaining.
|
|
|
|
*
|
|
|
|
* We can't simply store the descriptor in an xattr because it *must* be
|
|
|
|
* encrypted when ext4 encryption is used, but ext4 encryption doesn't encrypt
|
|
|
|
* xattrs. Also, if the descriptor includes a large signature blob it may be
|
|
|
|
* too large to store in an xattr without the EA_INODE feature.
|
|
|
|
*/
|
|
|
|
static int ext4_write_verity_descriptor(struct inode *inode, const void *desc,
|
|
|
|
size_t desc_size, u64 merkle_tree_size)
|
|
|
|
{
|
|
|
|
const u64 desc_pos = round_up(ext4_verity_metadata_pos(inode) +
|
|
|
|
merkle_tree_size, i_blocksize(inode));
|
|
|
|
const u64 desc_end = desc_pos + desc_size;
|
|
|
|
const __le32 desc_size_disk = cpu_to_le32(desc_size);
|
|
|
|
const u64 desc_size_pos = round_up(desc_end + sizeof(desc_size_disk),
|
|
|
|
i_blocksize(inode)) -
|
|
|
|
sizeof(desc_size_disk);
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = pagecache_write(inode, desc, desc_size, desc_pos);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
return pagecache_write(inode, &desc_size_disk, sizeof(desc_size_disk),
|
|
|
|
desc_size_pos);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ext4_end_enable_verity(struct file *filp, const void *desc,
|
|
|
|
size_t desc_size, u64 merkle_tree_size)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(filp);
|
|
|
|
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
|
|
|
|
handle_t *handle;
|
2021-03-03 04:04:19 +08:00
|
|
|
struct ext4_iloc iloc;
|
2019-07-23 00:26:24 +08:00
|
|
|
int err = 0;
|
|
|
|
|
2021-03-03 04:04:19 +08:00
|
|
|
/*
|
|
|
|
* If an error already occurred (which fs/verity/ signals by passing
|
|
|
|
* desc == NULL), then only clean-up is needed.
|
|
|
|
*/
|
|
|
|
if (desc == NULL)
|
|
|
|
goto cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2021-03-03 04:04:19 +08:00
|
|
|
/* Append the verity descriptor. */
|
|
|
|
err = ext4_write_verity_descriptor(inode, desc, desc_size,
|
|
|
|
merkle_tree_size);
|
|
|
|
if (err)
|
|
|
|
goto cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
/*
|
2021-03-03 04:04:19 +08:00
|
|
|
* Write all pages (both data and verity metadata). Note that this must
|
|
|
|
* happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
|
|
|
|
* beyond i_size won't be written properly. For crash consistency, this
|
|
|
|
* also must happen before the verity inode flag gets persisted.
|
2019-07-23 00:26:24 +08:00
|
|
|
*/
|
2021-03-03 04:04:19 +08:00
|
|
|
err = filemap_write_and_wait(inode->i_mapping);
|
|
|
|
if (err)
|
|
|
|
goto cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2021-03-03 04:04:19 +08:00
|
|
|
/*
|
|
|
|
* Finally, set the verity inode flag and remove the inode from the
|
|
|
|
* orphan list (in a single transaction).
|
|
|
|
*/
|
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
|
|
|
|
if (IS_ERR(handle)) {
|
2021-03-03 04:04:19 +08:00
|
|
|
err = PTR_ERR(handle);
|
|
|
|
goto cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
}
|
|
|
|
|
2021-03-03 04:04:19 +08:00
|
|
|
err = ext4_orphan_del(handle, inode);
|
|
|
|
if (err)
|
|
|
|
goto stop_and_cleanup;
|
|
|
|
|
|
|
|
err = ext4_reserve_inode_write(handle, inode, &iloc);
|
|
|
|
if (err)
|
|
|
|
goto stop_and_cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2021-03-03 04:04:19 +08:00
|
|
|
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
|
|
|
|
ext4_set_inode_flags(inode, false);
|
|
|
|
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
|
|
|
|
if (err)
|
|
|
|
goto stop_and_cleanup;
|
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
ext4_journal_stop(handle);
|
2021-03-03 04:04:19 +08:00
|
|
|
|
|
|
|
ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
stop_and_cleanup:
|
|
|
|
ext4_journal_stop(handle);
|
|
|
|
cleanup:
|
|
|
|
/*
|
|
|
|
* Verity failed to be enabled, so clean up by truncating any verity
|
|
|
|
* metadata that was written beyond i_size (both from cache and from
|
|
|
|
* disk), removing the inode from the orphan list (if it wasn't done
|
|
|
|
* already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
|
|
|
|
*/
|
|
|
|
truncate_inode_pages(inode->i_mapping, inode->i_size);
|
|
|
|
ext4_truncate(inode);
|
|
|
|
ext4_orphan_del(NULL, inode);
|
|
|
|
ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
|
|
|
|
return err;
|
2019-07-23 00:26:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int ext4_get_verity_descriptor_location(struct inode *inode,
|
|
|
|
size_t *desc_size_ret,
|
|
|
|
u64 *desc_pos_ret)
|
|
|
|
{
|
|
|
|
struct ext4_ext_path *path;
|
|
|
|
struct ext4_extent *last_extent;
|
|
|
|
u32 end_lblk;
|
|
|
|
u64 desc_size_pos;
|
|
|
|
__le32 desc_size_disk;
|
|
|
|
u32 desc_size;
|
|
|
|
u64 desc_pos;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Descriptor size is in last 4 bytes of last allocated block.
|
|
|
|
* See ext4_write_verity_descriptor().
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
|
|
|
|
EXT4_ERROR_INODE(inode, "verity file doesn't use extents");
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
|
|
|
|
if (IS_ERR(path))
|
|
|
|
return PTR_ERR(path);
|
|
|
|
|
|
|
|
last_extent = path[path->p_depth].p_ext;
|
|
|
|
if (!last_extent) {
|
|
|
|
EXT4_ERROR_INODE(inode, "verity file has no extents");
|
2022-09-24 10:12:11 +08:00
|
|
|
ext4_free_ext_path(path);
|
2019-07-23 00:26:24 +08:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
end_lblk = le32_to_cpu(last_extent->ee_block) +
|
|
|
|
ext4_ext_get_actual_len(last_extent);
|
|
|
|
desc_size_pos = (u64)end_lblk << inode->i_blkbits;
|
2022-09-24 10:12:11 +08:00
|
|
|
ext4_free_ext_path(path);
|
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
if (desc_size_pos < sizeof(desc_size_disk))
|
|
|
|
goto bad;
|
|
|
|
desc_size_pos -= sizeof(desc_size_disk);
|
|
|
|
|
|
|
|
err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
|
|
|
|
desc_size_pos);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
desc_size = le32_to_cpu(desc_size_disk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The descriptor is stored just before the desc_size_disk, but starting
|
|
|
|
* on a filesystem block boundary.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (desc_size > INT_MAX || desc_size > desc_size_pos)
|
|
|
|
goto bad;
|
|
|
|
|
|
|
|
desc_pos = round_down(desc_size_pos - desc_size, i_blocksize(inode));
|
|
|
|
if (desc_pos < ext4_verity_metadata_pos(inode))
|
|
|
|
goto bad;
|
|
|
|
|
|
|
|
*desc_size_ret = desc_size;
|
|
|
|
*desc_pos_ret = desc_pos;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
bad:
|
|
|
|
EXT4_ERROR_INODE(inode, "verity file corrupted; can't find descriptor");
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
|
|
|
|
size_t buf_size)
|
|
|
|
{
|
|
|
|
size_t desc_size = 0;
|
|
|
|
u64 desc_pos = 0;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
err = ext4_get_verity_descriptor_location(inode, &desc_size, &desc_pos);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (buf_size) {
|
|
|
|
if (desc_size > buf_size)
|
|
|
|
return -ERANGE;
|
|
|
|
err = pagecache_read(inode, buf, desc_size, desc_pos);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
return desc_size;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
|
fs-verity: implement readahead of Merkle tree pages
When fs-verity verifies data pages, currently it reads each Merkle tree
page synchronously using read_mapping_page().
Therefore, when the Merkle tree pages aren't already cached, fs-verity
causes an extra 4 KiB I/O request for every 512 KiB of data (assuming
that the Merkle tree uses SHA-256 and 4 KiB blocks). This results in
more I/O requests and performance loss than is strictly necessary.
Therefore, implement readahead of the Merkle tree pages.
For simplicity, we take advantage of the fact that the kernel already
does readahead of the file's *data*, just like it does for any other
file. Due to this, we don't really need a separate readahead state
(struct file_ra_state) just for the Merkle tree, but rather we just need
to piggy-back on the existing data readahead requests.
We also only really need to bother with the first level of the Merkle
tree, since the usual fan-out factor is 128, so normally over 99% of
Merkle tree I/O requests are for the first level.
Therefore, make fsverity_verify_bio() enable readahead of the first
Merkle tree level, for up to 1/4 the number of pages in the bio, when it
sees that the REQ_RAHEAD flag is set on the bio. The readahead size is
then passed down to ->read_merkle_tree_page() for the filesystem to
(optionally) implement if it sees that the requested page is uncached.
While we're at it, also make build_merkle_tree_level() set the Merkle
tree readahead size, since it's easy to do there.
However, for now don't set the readahead size in fsverity_verify_page(),
since currently it's only used to verify holes on ext4 and f2fs, and it
would need parameters added to know how much to read ahead.
This patch significantly improves fs-verity sequential read performance.
Some quick benchmarks with 'cat'-ing a 250MB file after dropping caches:
On an ARM64 phone (using sha256-ce):
Before: 217 MB/s
After: 263 MB/s
(compare to sha256sum of non-verity file: 357 MB/s)
In an x86_64 VM (using sha256-avx2):
Before: 173 MB/s
After: 215 MB/s
(compare to sha256sum of non-verity file: 223 MB/s)
Link: https://lore.kernel.org/r/20200106205533.137005-1-ebiggers@kernel.org
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2020-01-07 04:55:33 +08:00
|
|
|
pgoff_t index,
|
|
|
|
unsigned long num_ra_pages)
|
2019-07-23 00:26:24 +08:00
|
|
|
{
|
2023-03-25 02:01:29 +08:00
|
|
|
struct folio *folio;
|
fs-verity: implement readahead of Merkle tree pages
When fs-verity verifies data pages, currently it reads each Merkle tree
page synchronously using read_mapping_page().
Therefore, when the Merkle tree pages aren't already cached, fs-verity
causes an extra 4 KiB I/O request for every 512 KiB of data (assuming
that the Merkle tree uses SHA-256 and 4 KiB blocks). This results in
more I/O requests and performance loss than is strictly necessary.
Therefore, implement readahead of the Merkle tree pages.
For simplicity, we take advantage of the fact that the kernel already
does readahead of the file's *data*, just like it does for any other
file. Due to this, we don't really need a separate readahead state
(struct file_ra_state) just for the Merkle tree, but rather we just need
to piggy-back on the existing data readahead requests.
We also only really need to bother with the first level of the Merkle
tree, since the usual fan-out factor is 128, so normally over 99% of
Merkle tree I/O requests are for the first level.
Therefore, make fsverity_verify_bio() enable readahead of the first
Merkle tree level, for up to 1/4 the number of pages in the bio, when it
sees that the REQ_RAHEAD flag is set on the bio. The readahead size is
then passed down to ->read_merkle_tree_page() for the filesystem to
(optionally) implement if it sees that the requested page is uncached.
While we're at it, also make build_merkle_tree_level() set the Merkle
tree readahead size, since it's easy to do there.
However, for now don't set the readahead size in fsverity_verify_page(),
since currently it's only used to verify holes on ext4 and f2fs, and it
would need parameters added to know how much to read ahead.
This patch significantly improves fs-verity sequential read performance.
Some quick benchmarks with 'cat'-ing a 250MB file after dropping caches:
On an ARM64 phone (using sha256-ce):
Before: 217 MB/s
After: 263 MB/s
(compare to sha256sum of non-verity file: 357 MB/s)
In an x86_64 VM (using sha256-avx2):
Before: 173 MB/s
After: 215 MB/s
(compare to sha256sum of non-verity file: 223 MB/s)
Link: https://lore.kernel.org/r/20200106205533.137005-1-ebiggers@kernel.org
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2020-01-07 04:55:33 +08:00
|
|
|
|
2019-07-23 00:26:24 +08:00
|
|
|
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
|
|
|
|
|
2023-03-25 02:01:29 +08:00
|
|
|
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
|
- Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page().
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather
than exclusive, and has fixed a bunch of errors which were caused by its
unintuitive meaning.
- Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also coverted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page recalim accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZEr3zQAKCRDdBJ7gKXxA
jlLoAP0fpQBipwFxED0Us4SKQfupV6z4caXNJGPeay7Aj11/kQD/aMRC2uPfgr96
eMG3kwn2pqkB9ST2QpkaRbxA//eMbQY=
=J+Dj
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- Nick Piggin's "shoot lazy tlbs" series, to improve the performance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj
Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page()
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive
rather than exclusive, and has fixed a bunch of errors which were
caused by its unintuitive meaning.
- Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also converted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics
flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page reclaim
accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
shmem: restrict noswap option to initial user namespace
mm/khugepaged: fix conflicting mods to collapse_file()
sparse: remove unnecessary 0 values from rc
mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
maple_tree: fix allocation in mas_sparse_area()
mm: do not increment pgfault stats when page fault handler retries
zsmalloc: allow only one active pool compaction context
selftests/mm: add new selftests for KSM
mm: add new KSM process and sysfs knobs
mm: add new api to enable ksm per process
mm: shrinkers: fix debugfs file permissions
mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
migrate_pages_batch: fix statistics for longterm pin retry
userfaultfd: use helper function range_in_vma()
lib/show_mem.c: use for_each_populated_zone() simplify code
mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
fs/buffer: convert create_page_buffers to folio_create_buffers
fs/buffer: add folio_create_empty_buffers helper
...
2023-04-28 10:42:02 +08:00
|
|
|
if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
|
2022-10-13 03:34:19 +08:00
|
|
|
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
|
|
|
|
|
- Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page().
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather
than exclusive, and has fixed a bunch of errors which were caused by its
unintuitive meaning.
- Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also coverted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page recalim accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZEr3zQAKCRDdBJ7gKXxA
jlLoAP0fpQBipwFxED0Us4SKQfupV6z4caXNJGPeay7Aj11/kQD/aMRC2uPfgr96
eMG3kwn2pqkB9ST2QpkaRbxA//eMbQY=
=J+Dj
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj
Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page()
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive
rather than exclusive, and has fixed a bunch of errors which were
caused by its unintuitive meaning.
- Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also coverted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics
flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page recalim
accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
shmem: restrict noswap option to initial user namespace
mm/khugepaged: fix conflicting mods to collapse_file()
sparse: remove unnecessary 0 values from rc
mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
maple_tree: fix allocation in mas_sparse_area()
mm: do not increment pgfault stats when page fault handler retries
zsmalloc: allow only one active pool compaction context
selftests/mm: add new selftests for KSM
mm: add new KSM process and sysfs knobs
mm: add new api to enable ksm per process
mm: shrinkers: fix debugfs file permissions
mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
migrate_pages_batch: fix statistics for longterm pin retry
userfaultfd: use helper function range_in_vma()
lib/show_mem.c: use for_each_populated_zone() simplify code
mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
fs/buffer: convert create_page_buffers to folio_create_buffers
fs/buffer: add folio_create_empty_buffers helper
...
2023-04-28 10:42:02 +08:00
|
|
|
if (!IS_ERR(folio))
|
2023-03-25 02:01:29 +08:00
|
|
|
folio_put(folio);
|
fs-verity: implement readahead of Merkle tree pages
When fs-verity verifies data pages, currently it reads each Merkle tree
page synchronously using read_mapping_page().
Therefore, when the Merkle tree pages aren't already cached, fs-verity
causes an extra 4 KiB I/O request for every 512 KiB of data (assuming
that the Merkle tree uses SHA-256 and 4 KiB blocks). This results in
more I/O requests and performance loss than is strictly necessary.
Therefore, implement readahead of the Merkle tree pages.
For simplicity, we take advantage of the fact that the kernel already
does readahead of the file's *data*, just like it does for any other
file. Due to this, we don't really need a separate readahead state
(struct file_ra_state) just for the Merkle tree, but rather we just need
to piggy-back on the existing data readahead requests.
We also only really need to bother with the first level of the Merkle
tree, since the usual fan-out factor is 128, so normally over 99% of
Merkle tree I/O requests are for the first level.
Therefore, make fsverity_verify_bio() enable readahead of the first
Merkle tree level, for up to 1/4 the number of pages in the bio, when it
sees that the REQ_RAHEAD flag is set on the bio. The readahead size is
then passed down to ->read_merkle_tree_page() for the filesystem to
(optionally) implement if it sees that the requested page is uncached.
While we're at it, also make build_merkle_tree_level() set the Merkle
tree readahead size, since it's easy to do there.
However, for now don't set the readahead size in fsverity_verify_page(),
since currently it's only used to verify holes on ext4 and f2fs, and it
would need parameters added to know how much to read ahead.
This patch significantly improves fs-verity sequential read performance.
Some quick benchmarks with 'cat'-ing a 250MB file after dropping caches:
On an ARM64 phone (using sha256-ce):
Before: 217 MB/s
After: 263 MB/s
(compare to sha256sum of non-verity file: 357 MB/s)
In an x86_64 VM (using sha256-avx2):
Before: 173 MB/s
After: 215 MB/s
(compare to sha256sum of non-verity file: 223 MB/s)
Link: https://lore.kernel.org/r/20200106205533.137005-1-ebiggers@kernel.org
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2020-01-07 04:55:33 +08:00
|
|
|
else if (num_ra_pages > 1)
|
2020-10-16 11:06:14 +08:00
|
|
|
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
|
2023-03-25 02:01:29 +08:00
|
|
|
folio = read_mapping_folio(inode->i_mapping, index, NULL);
|
- Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page().
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather
than exclusive, and has fixed a bunch of errors which were caused by its
unintuitive meaning.
- Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also coverted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page recalim accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZEr3zQAKCRDdBJ7gKXxA
jlLoAP0fpQBipwFxED0Us4SKQfupV6z4caXNJGPeay7Aj11/kQD/aMRC2uPfgr96
eMG3kwn2pqkB9ST2QpkaRbxA//eMbQY=
=J+Dj
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of
switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj
Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page()
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so it no longer requires swap
backing. Use `mount -o noswap'.
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd,
permitting userspace to wr-protect anon memory unpopulated ptes.
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive
rather than exclusive, and has fixed a bunch of errors which were
caused by its unintuitive meaning.
- Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
which causes minor faults to install a write-protected pte.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for
DRM, but DRM doesn't use it any more.
- Lorenzo has also coverted read_kcore() and vread() to use iterators
and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
locks in ->map_page(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
no longer causes latency glitches on cpu isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics
flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page recalim
accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
per-process and per-cgroup basis.
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
shmem: restrict noswap option to initial user namespace
mm/khugepaged: fix conflicting mods to collapse_file()
sparse: remove unnecessary 0 values from rc
mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
maple_tree: fix allocation in mas_sparse_area()
mm: do not increment pgfault stats when page fault handler retries
zsmalloc: allow only one active pool compaction context
selftests/mm: add new selftests for KSM
mm: add new KSM process and sysfs knobs
mm: add new api to enable ksm per process
mm: shrinkers: fix debugfs file permissions
mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
migrate_pages_batch: fix statistics for longterm pin retry
userfaultfd: use helper function range_in_vma()
lib/show_mem.c: use for_each_populated_zone() simplify code
mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
fs/buffer: convert create_page_buffers to folio_create_buffers
fs/buffer: add folio_create_empty_buffers helper
...
2023-04-28 10:42:02 +08:00
|
|
|
if (IS_ERR(folio))
|
|
|
|
return ERR_CAST(folio);
|
fs-verity: implement readahead of Merkle tree pages
When fs-verity verifies data pages, currently it reads each Merkle tree
page synchronously using read_mapping_page().
Therefore, when the Merkle tree pages aren't already cached, fs-verity
causes an extra 4 KiB I/O request for every 512 KiB of data (assuming
that the Merkle tree uses SHA-256 and 4 KiB blocks). This results in
more I/O requests and performance loss than is strictly necessary.
Therefore, implement readahead of the Merkle tree pages.
For simplicity, we take advantage of the fact that the kernel already
does readahead of the file's *data*, just like it does for any other
file. Due to this, we don't really need a separate readahead state
(struct file_ra_state) just for the Merkle tree, but rather we just need
to piggy-back on the existing data readahead requests.
We also only really need to bother with the first level of the Merkle
tree, since the usual fan-out factor is 128, so normally over 99% of
Merkle tree I/O requests are for the first level.
Therefore, make fsverity_verify_bio() enable readahead of the first
Merkle tree level, for up to 1/4 the number of pages in the bio, when it
sees that the REQ_RAHEAD flag is set on the bio. The readahead size is
then passed down to ->read_merkle_tree_page() for the filesystem to
(optionally) implement if it sees that the requested page is uncached.
While we're at it, also make build_merkle_tree_level() set the Merkle
tree readahead size, since it's easy to do there.
However, for now don't set the readahead size in fsverity_verify_page(),
since currently it's only used to verify holes on ext4 and f2fs, and it
would need parameters added to know how much to read ahead.
This patch significantly improves fs-verity sequential read performance.
Some quick benchmarks with 'cat'-ing a 250MB file after dropping caches:
On an ARM64 phone (using sha256-ce):
Before: 217 MB/s
After: 263 MB/s
(compare to sha256sum of non-verity file: 357 MB/s)
In an x86_64 VM (using sha256-avx2):
Before: 173 MB/s
After: 215 MB/s
(compare to sha256sum of non-verity file: 223 MB/s)
Link: https://lore.kernel.org/r/20200106205533.137005-1-ebiggers@kernel.org
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2020-01-07 04:55:33 +08:00
|
|
|
}
|
2023-03-25 02:01:29 +08:00
|
|
|
return folio_file_page(folio, index);
|
2019-07-23 00:26:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf,
|
2022-12-15 06:43:04 +08:00
|
|
|
u64 pos, unsigned int size)
|
2019-07-23 00:26:24 +08:00
|
|
|
{
|
2022-12-15 06:43:04 +08:00
|
|
|
pos += ext4_verity_metadata_pos(inode);
|
2019-07-23 00:26:24 +08:00
|
|
|
|
2022-12-15 06:43:04 +08:00
|
|
|
return pagecache_write(inode, buf, size, pos);
|
2019-07-23 00:26:24 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
const struct fsverity_operations ext4_verityops = {
|
|
|
|
.begin_enable_verity = ext4_begin_enable_verity,
|
|
|
|
.end_enable_verity = ext4_end_enable_verity,
|
|
|
|
.get_verity_descriptor = ext4_get_verity_descriptor,
|
|
|
|
.read_merkle_tree_page = ext4_read_merkle_tree_page,
|
|
|
|
.write_merkle_tree_block = ext4_write_merkle_tree_block,
|
|
|
|
};
|