
Merge tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

Pull erofs updates from Gao Xiang:
 "There is no outstanding feature for this cycle. The most useful
  changes are SEEK_{DATA,HOLE} support and some decompression
  micro-optimization. Other than those, there are some bugfixes and
  cleanups as usual:

   - Add SEEK_{DATA,HOLE} support

   - Free redundant pclusters if no cached compressed data is valid

   - Add sysfs entry to drop internal caches

   - Several bugfixes & cleanups"

* tag 'erofs-for-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: handle NONHEAD !delta[1] lclusters gracefully
  erofs: clarify direct I/O support
  erofs: fix blksize < PAGE_SIZE for file-backed mounts
  erofs: get rid of `buf->kmap_type`
  erofs: fix file-backed mounts over FUSE
  erofs: simplify definition of the log functions
  erofs: add sysfs node to drop internal caches
  erofs: free pclusters if no cached folio is attached
  erofs: sunset `struct erofs_workgroup`
  erofs: move erofs_workgroup operations into zdata.c
  erofs: get rid of erofs_{find,insert}_workgroup
  erofs: add SEEK_{DATA,HOLE} support
Commit: 90a19b744d
Author: Linus Torvalds
Date:   2024-11-21 09:17:33 -08:00

9 changed files with 276 additions and 296 deletions
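As a point of reference for the new SEEK_{DATA,HOLE} support pulled in above, here is a minimal userspace sketch (not part of this series; the target path is simply any file on an EROFS mount) that walks the data extents of a file with lseek(2):

#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t end, data, hole, pos = 0;
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	end = lseek(fd, 0, SEEK_END);
	while (pos < end) {
		data = lseek(fd, pos, SEEK_DATA);
		if (data < 0)		/* ENXIO: no data past pos */
			break;
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data extent: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}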

Documentation/ABI/testing/sysfs-fs-erofs

@ -16,3 +16,14 @@ Description: Control strategy of sync decompression:
readahead on atomic contexts only.
- 1 (force on): enable for readpage and readahead.
- 2 (force off): disable for all situations.
What: /sys/fs/erofs/<disk>/drop_caches
Date: November 2024
Contact: "Guo Chunhai" <guochunhai@vivo.com>
Description: Writing to this will drop compression-related caches,
currently used to drop in-memory pclusters and cached
compressed folios:
- 1 : invalidate cached compressed folios
- 2 : drop in-memory pclusters
- 3 : drop in-memory pclusters and cached compressed folios
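As a usage illustration for the node documented above (a sketch, not from this series; "sda" stands in for the actual <disk> identifier), dropping both kinds of caches is simply `echo 3 > /sys/fs/erofs/<disk>/drop_caches`, or programmatically:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* substitute the real <disk> identifier for "sda" */
	int fd = open("/sys/fs/erofs/sda/drop_caches", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "3" = drop in-memory pclusters and cached compressed folios */
	if (write(fd, "3", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}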

fs/erofs/data.c

@ -10,10 +10,10 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
if (buf->kmap_type == EROFS_KMAP)
kunmap_local(buf->base);
if (!buf->base)
return;
kunmap_local(buf->base);
buf->base = NULL;
buf->kmap_type = EROFS_NO_KMAP;
}
void erofs_put_metabuf(struct erofs_buf *buf)
@ -38,20 +38,13 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
}
if (!folio || !folio_contains(folio, index)) {
erofs_put_metabuf(buf);
folio = read_mapping_folio(buf->mapping, index, NULL);
folio = read_mapping_folio(buf->mapping, index, buf->file);
if (IS_ERR(folio))
return folio;
}
buf->page = folio_file_page(folio, index);
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
buf->base = kmap_local_page(buf->page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
return ERR_PTR(-EFAULT);
}
if (!buf->base && type == EROFS_KMAP)
buf->base = kmap_local_page(buf->page);
if (type == EROFS_NO_KMAP)
return NULL;
return buf->base + (offset & ~PAGE_MASK);
@ -61,9 +54,11 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
if (erofs_is_fileio_mode(sbi))
buf->mapping = file_inode(sbi->fdev)->i_mapping;
else if (erofs_is_fscache_mode(sb))
buf->file = NULL;
if (erofs_is_fileio_mode(sbi)) {
buf->file = sbi->fdev; /* some fs like FUSE needs it */
buf->mapping = buf->file->f_mapping;
} else if (erofs_is_fscache_mode(sb))
buf->mapping = sbi->s_fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
@ -350,7 +345,6 @@ static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
struct erofs_buf buf = {
.page = kmap_to_page(ptr),
.base = ptr,
.kmap_type = EROFS_KMAP,
};
DBG_BUGON(iomap->type != IOMAP_INLINE);
@ -411,22 +405,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
if (iocb->ki_flags & IOCB_DIRECT) {
struct block_device *bdev = inode->i_sb->s_bdev;
unsigned int blksize_mask;
if (bdev)
blksize_mask = bdev_logical_block_size(bdev) - 1;
else
blksize_mask = i_blocksize(inode) - 1;
if ((iocb->ki_pos | iov_iter_count(to) |
iov_iter_alignment(to)) & blksize_mask)
return -EINVAL;
if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, NULL, 0);
}
return filemap_read(iocb, to, 0);
}
@ -473,8 +454,32 @@ static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
#define erofs_file_mmap generic_file_readonly_mmap
#endif
static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
const struct iomap_ops *ops = &erofs_iomap_ops;
if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
#ifdef CONFIG_EROFS_FS_ZIP
ops = &z_erofs_iomap_report_ops;
#else
return generic_file_llseek(file, offset, whence);
#endif
if (whence == SEEK_HOLE)
offset = iomap_seek_hole(inode, offset, ops);
else if (whence == SEEK_DATA)
offset = iomap_seek_data(inode, offset, ops);
else
return generic_file_llseek(file, offset, whence);
if (offset < 0)
return offset;
return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.get_unmapped_area = thp_get_unmapped_area,
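To illustrate the user-visible contract of the direct I/O change above (a hedged sketch, not from this series; the path is a placeholder and 4096 is only an assumed worst-case alignment — the real requirement is the logical block size of the backing device, which iomap_dio_rw() now validates):

#define _GNU_SOURCE		/* O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	int fd = open("/mnt/erofs/file", O_RDONLY | O_DIRECT);

	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	/* offset, length and buffer address are all 4096-byte aligned;
	 * misaligned requests now fail with -EINVAL from iomap_dio_rw() */
	if (pread(fd, buf, 4096, 0) < 0)
		return 1;
	free(buf);
	close(fd);
	return 0;
}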

fs/erofs/inode.c

@ -318,6 +318,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
struct block_device *bdev = inode->i_sb->s_bdev;
bool compressed =
erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout);
@ -330,15 +331,14 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
/*
* Return the DIO alignment restrictions if requested.
*
* In EROFS, STATX_DIOALIGN is not supported in ondemand mode and
* compressed files, so in these cases we report no DIO support.
* In EROFS, STATX_DIOALIGN is only supported in bdev-based mode
* and uncompressed inodes, otherwise we report no DIO support.
*/
if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
stat->result_mask |= STATX_DIOALIGN;
if (!erofs_is_fscache_mode(inode->i_sb) && !compressed) {
stat->dio_mem_align =
bdev_logical_block_size(inode->i_sb->s_bdev);
stat->dio_offset_align = stat->dio_mem_align;
if (bdev && !compressed) {
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
}
generic_fillattr(idmap, request_mask, inode, stat);
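For reference, a hedged userspace sketch of querying the values reported here (hypothetical path; needs statx(2) with STATX_DIOALIGN, i.e. Linux 6.1+ headers):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "/mnt/erofs/file", 0, STATX_DIOALIGN, &stx))
		return 1;
	/* zero alignment values mean direct I/O is not supported here,
	 * e.g. for compressed inodes or non-bdev-backed mounts */
	if ((stx.stx_mask & STATX_DIOALIGN) && stx.stx_dio_mem_align)
		printf("dio_mem_align=%u dio_offset_align=%u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	else
		printf("no direct I/O support reported\n");
	return 0;
}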

fs/erofs/internal.h

@ -20,18 +20,12 @@
#include <linux/iomap.h>
#include "erofs_fs.h"
/* redefine pr_fmt "erofs: " */
#undef pr_fmt
#define pr_fmt(fmt) "erofs: " fmt
__printf(3, 4) void _erofs_err(struct super_block *sb,
const char *function, const char *fmt, ...);
__printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
#define erofs_err(sb, fmt, ...) \
_erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__)
__printf(3, 4) void _erofs_info(struct super_block *sb,
const char *function, const char *fmt, ...);
_erofs_printk(sb, KERN_ERR fmt "\n", ##__VA_ARGS__)
#define erofs_info(sb, fmt, ...) \
_erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__)
_erofs_printk(sb, KERN_INFO fmt "\n", ##__VA_ARGS__)
#ifdef CONFIG_EROFS_FS_DEBUG
#define DBG_BUGON BUG_ON
#else
@ -208,12 +202,6 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
/* basic unit of the workstation of a super_block */
struct erofs_workgroup {
pgoff_t index;
struct lockref lockref;
};
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@ -221,9 +209,9 @@ enum erofs_kmap_type {
struct erofs_buf {
struct address_space *mapping;
struct file *file;
struct page *page;
void *base;
enum erofs_kmap_type kmap_type;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
@ -456,20 +444,17 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
void erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *grp);
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
extern atomic_long_t erofs_global_shrink_cnt;
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
void *z_erofs_get_gbuf(unsigned int requiredpages);

fs/erofs/super.c

@ -18,37 +18,22 @@
static struct kmem_cache *erofs_inode_cachep __read_mostly;
void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
void _erofs_printk(struct super_block *sb, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
int level;
va_start(args, fmt);
vaf.fmt = fmt;
level = printk_get_level(fmt);
vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
if (sb)
pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
printk("%c%cerofs (device %s): %pV",
KERN_SOH_ASCII, level, sb->s_id, &vaf);
else
pr_err("%s: %pV", func, &vaf);
va_end(args);
}
void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
if (sb)
pr_info("(device %s): %pV", sb->s_id, &vaf);
else
pr_info("%pV", &vaf);
printk("%c%cerofs: %pV", KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
@ -631,7 +616,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
errorfc(fc, "unsupported blksize for fscache mode");
return -EINVAL;
}
if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
if (erofs_is_fileio_mode(sbi)) {
sb->s_blocksize = 1 << sbi->blkszbits;
sb->s_blocksize_bits = sbi->blkszbits;
} else if (!sb_set_blocksize(sb, 1 << sbi->blkszbits)) {
errorfc(fc, "failed to set erofs blksize");
return -EINVAL;
}

fs/erofs/sysfs.c

@ -10,6 +10,7 @@
enum {
attr_feature,
attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
};
@ -57,11 +58,13 @@ static struct erofs_attr erofs_attr_##_name = { \
#ifdef CONFIG_EROFS_FS_ZIP
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
static struct attribute *erofs_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
ATTR_LIST(drop_caches),
#endif
NULL,
};
@ -163,6 +166,20 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
return -EINVAL;
*(bool *)ptr = !!t;
return len;
#ifdef CONFIG_EROFS_FS_ZIP
case attr_drop_caches:
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (t < 1 || t > 3)
return -EINVAL;
if (t & 2)
z_erofs_shrink_scan(sbi, ~0UL);
if (t & 1)
invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
return len;
#endif
}
return 0;
}

fs/erofs/zdata.c

@ -44,12 +44,15 @@ __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
* A: Field should be accessed / updated in atomic for parallelized code.
*/
struct z_erofs_pcluster {
struct erofs_workgroup obj;
struct mutex lock;
struct lockref lockref;
/* A: point to next chained pcluster or TAILs */
z_erofs_next_pcluster_t next;
/* I: start block address of this pcluster */
erofs_off_t index;
/* L: the maximum decompression size of this round */
unsigned int length;
@ -108,7 +111,7 @@ struct z_erofs_decompressqueue {
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
return !pcl->obj.index;
return !pcl->index;
}
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
@ -116,7 +119,6 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
{
return fo->mapping == MNGD_MAPPING(sbi);
@ -548,7 +550,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
page = find_get_page(mc, pcl->index + i);
if (!page) {
/* I/O is needed, no possible to decompress directly */
standalone = false;
@ -564,13 +566,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
}
spin_lock(&pcl->obj.lockref.lock);
spin_lock(&pcl->lockref.lock);
if (!pcl->compressed_bvecs[i].page) {
pcl->compressed_bvecs[i].page = page ? page : newpage;
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
continue;
}
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
if (page)
put_page(page);
@ -587,11 +589,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
}
/* (erofs_shrinker) disconnect cached encoded data with pclusters */
int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct z_erofs_pcluster *pcl)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
struct folio *folio;
int i;
@ -626,8 +626,8 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
return true;
ret = false;
spin_lock(&pcl->obj.lockref.lock);
if (pcl->obj.lockref.count <= 0) {
spin_lock(&pcl->lockref.lock);
if (pcl->lockref.count <= 0) {
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
@ -638,7 +638,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
}
}
}
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
return ret;
}
@ -689,15 +689,15 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
if (exclusive) {
/* give priority for inplaceio to use file pages first */
spin_lock(&pcl->obj.lockref.lock);
spin_lock(&pcl->lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
return 0;
}
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@ -710,13 +710,30 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
{
if (lockref_get_not_zero(&pcl->lockref))
return true;
spin_lock(&pcl->lockref.lock);
if (__lockref_is_dead(&pcl->lockref)) {
spin_unlock(&pcl->lockref.lock);
return false;
}
if (!pcl->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
spin_unlock(&pcl->lockref.lock);
return true;
}
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
struct erofs_workgroup *grp;
struct z_erofs_pcluster *pcl, *pre;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
@ -730,8 +747,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
spin_lock_init(&pcl->obj.lockref.lock);
pcl->obj.lockref.count = 1; /* one ref for this request */
spin_lock_init(&pcl->lockref.lock);
pcl->lockref.count = 1; /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
@ -749,19 +766,26 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(!mutex_trylock(&pcl->lock));
if (ztailpacking) {
pcl->obj.index = 0; /* which indicates ztailpacking */
pcl->index = 0; /* which indicates ztailpacking */
} else {
pcl->obj.index = erofs_blknr(sb, map->m_pa);
grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
if (IS_ERR(grp)) {
err = PTR_ERR(grp);
goto err_out;
pcl->index = erofs_blknr(sb, map->m_pa);
while (1) {
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
NULL, pcl, GFP_KERNEL);
if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
xa_unlock(&sbi->managed_pslots);
break;
}
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
}
if (grp != &pcl->obj) {
fe->pcl = container_of(grp,
struct z_erofs_pcluster, obj);
if (xa_is_err(pre)) {
err = xa_err(pre);
goto err_out;
} else if (pre) {
fe->pcl = pre;
err = -EEXIST;
goto err_out;
}
@ -781,7 +805,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct erofs_workgroup *grp = NULL;
struct z_erofs_pcluster *pcl = NULL;
int ret;
DBG_BUGON(fe->pcl);
@ -789,14 +813,23 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(sb, blknr);
while (1) {
rcu_read_lock();
pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
if (!pcl || z_erofs_get_pcluster(pcl)) {
DBG_BUGON(pcl && blknr != pcl->index);
rcu_read_unlock();
break;
}
rcu_read_unlock();
}
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
if (grp) {
fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
if (pcl) {
fe->pcl = pcl;
ret = -EEXIST;
} else {
ret = z_erofs_register_pcluster(fe);
@ -851,12 +884,87 @@ static void z_erofs_rcu_callback(struct rcu_head *head)
struct z_erofs_pcluster, rcu));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
struct z_erofs_pcluster *pcl)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
if (pcl->lockref.count)
return false;
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
/*
* Note that all cached folios should be detached before deleted from
* the XArray. Otherwise some folios could be still attached to the
* orphan old pcluster when the new one is available in the tree.
*/
if (erofs_try_to_free_all_cached_folios(sbi, pcl))
return false;
/*
* It's impossible to fail after the pcluster is freezed, but in order
* to avoid some race conditions, add a DBG_BUGON to observe this.
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
lockref_mark_dead(&pcl->lockref);
return true;
}
static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
struct z_erofs_pcluster *pcl)
{
bool free;
spin_lock(&pcl->lockref.lock);
free = __erofs_try_to_release_pcluster(sbi, pcl);
spin_unlock(&pcl->lockref.lock);
if (free) {
atomic_long_dec(&erofs_global_shrink_cnt);
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
return free;
}
unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
unsigned long nr_shrink)
{
struct z_erofs_pcluster *pcl;
unsigned int freed = 0;
unsigned long index;
xa_lock(&sbi->managed_pslots);
xa_for_each(&sbi->managed_pslots, index, pcl) {
/* try to shrink each valid pcluster */
if (!erofs_try_to_release_pcluster(sbi, pcl))
continue;
xa_unlock(&sbi->managed_pslots);
++freed;
if (!--nr_shrink)
return freed;
xa_lock(&sbi->managed_pslots);
}
xa_unlock(&sbi->managed_pslots);
return freed;
}
static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
struct z_erofs_pcluster *pcl, bool try_free)
{
bool free = false;
if (lockref_put_or_lock(&pcl->lockref))
return;
DBG_BUGON(__lockref_is_dead(&pcl->lockref));
if (!--pcl->lockref.count) {
if (try_free && xa_trylock(&sbi->managed_pslots)) {
free = __erofs_try_to_release_pcluster(sbi, pcl);
xa_unlock(&sbi->managed_pslots);
}
atomic_long_add(!free, &erofs_global_shrink_cnt);
}
spin_unlock(&pcl->lockref.lock);
if (free)
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
@ -877,7 +985,7 @@ static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
* any longer if the pcluster isn't hosted by ourselves.
*/
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
erofs_workgroup_put(&pcl->obj);
z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
fe->pcl = NULL;
}
@ -1179,6 +1287,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
int i, j, jtop, err2;
struct page *page;
bool overlapped;
bool try_free = true;
mutex_lock(&pcl->lock);
be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
@ -1236,9 +1345,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
page = be->compressed_pages[i];
if (!page ||
erofs_folio_is_managed(sbi, page_folio(page)))
if (!page)
continue;
if (erofs_folio_is_managed(sbi, page_folio(page))) {
try_free = false;
continue;
}
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@ -1284,6 +1396,11 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
/* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
mutex_unlock(&pcl->lock);
if (z_erofs_is_inline_pcluster(pcl))
z_erofs_free_pcluster(pcl);
else
z_erofs_put_pcluster(sbi, pcl, try_free);
return err;
}
@ -1306,10 +1423,6 @@ static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
err = z_erofs_decompress_pcluster(&be, err) ?: err;
if (z_erofs_is_inline_pcluster(be.pcl))
z_erofs_free_pcluster(be.pcl);
else
erofs_workgroup_put(&be.pcl->obj);
}
return err;
}
@ -1391,9 +1504,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
spin_lock(&pcl->obj.lockref.lock);
spin_lock(&pcl->lockref.lock);
zbv = pcl->compressed_bvecs[nr];
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
if (!zbv.page)
goto out_allocfolio;
@ -1455,23 +1568,23 @@ repeat:
folio_put(folio);
out_allocfolio:
page = __erofs_allocpage(&f->pagepool, gfp, true);
spin_lock(&pcl->obj.lockref.lock);
spin_lock(&pcl->lockref.lock);
if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
if (page)
erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
cond_resched();
goto repeat;
}
pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
spin_unlock(&pcl->obj.lockref.lock);
spin_unlock(&pcl->lockref.lock);
bvec->bv_page = page;
if (!page)
return;
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@ -1603,7 +1716,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
.m_pa = erofs_pos(sb, pcl->obj.index),
.m_pa = erofs_pos(sb, pcl->index),
};
(void)erofs_map_dev(sb, &mdev);

fs/erofs/zmap.c

@ -219,7 +219,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
unsigned int amortizedshift;
erofs_off_t pos;
if (lcn >= totalidx)
if (lcn >= totalidx || vi->z_logical_clusterbits > 14)
return -EINVAL;
m->lcn = lcn;
@ -390,7 +390,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
do {
while (1) {
/* handle the last EOF pcluster (no next HEAD lcluster) */
if ((lcn << lclusterbits) >= inode->i_size) {
map->m_llen = inode->i_size - map->m_la;
@ -402,14 +402,16 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return err;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
DBG_BUGON(!m->delta[1] &&
m->clusterofs != 1 << lclusterbits);
/* work around invalid d1 generated by pre-1.0 mkfs */
if (unlikely(!m->delta[1])) {
m->delta[1] = 1;
DBG_BUGON(1);
}
} else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
/* go on until the next HEAD lcluster */
if (lcn != headlcn)
break;
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
} else {
erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
@ -418,8 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return -EOPNOTSUPP;
}
lcn += m->delta[1];
} while (m->delta[1]);
}
map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
return 0;
}

fs/erofs/zutil.c

@ -2,6 +2,7 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2024 Alibaba Cloud
*/
#include "internal.h"
@ -19,13 +20,12 @@ static unsigned int z_erofs_gbuf_count, z_erofs_gbuf_nrpages,
module_param_named(global_buffers, z_erofs_gbuf_count, uint, 0444);
module_param_named(reserved_pages, z_erofs_rsv_nrpages, uint, 0444);
static atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
/* protected by 'erofs_sb_list_lock' */
static unsigned int shrinker_run_no;
atomic_long_t erofs_global_shrink_cnt; /* for all mounted instances */
/* protects the mounted 'erofs_sb_list' */
/* protects `erofs_sb_list_lock` and the mounted `erofs_sb_list` */
static DEFINE_SPINLOCK(erofs_sb_list_lock);
static LIST_HEAD(erofs_sb_list);
static unsigned int shrinker_run_no;
static struct shrinker *erofs_shrinker_info;
static unsigned int z_erofs_gbuf_id(void)
@ -214,145 +214,6 @@ void erofs_release_pages(struct page **pagepool)
}
}
static bool erofs_workgroup_get(struct erofs_workgroup *grp)
{
if (lockref_get_not_zero(&grp->lockref))
return true;
spin_lock(&grp->lockref.lock);
if (__lockref_is_dead(&grp->lockref)) {
spin_unlock(&grp->lockref.lock);
return false;
}
if (!grp->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
spin_unlock(&grp->lockref.lock);
return true;
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_workgroup *grp;
repeat:
rcu_read_lock();
grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
if (!erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
goto repeat;
}
DBG_BUGON(index != grp->index);
}
rcu_read_unlock();
return grp;
}
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *grp)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
struct erofs_workgroup *pre;
DBG_BUGON(grp->lockref.count < 1);
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
NULL, grp, GFP_KERNEL);
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
} else if (!erofs_workgroup_get(pre)) {
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
goto repeat;
}
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
return grp;
}
static void __erofs_workgroup_free(struct erofs_workgroup *grp)
{
atomic_long_dec(&erofs_global_shrink_cnt);
erofs_workgroup_free_rcu(grp);
}
void erofs_workgroup_put(struct erofs_workgroup *grp)
{
if (lockref_put_or_lock(&grp->lockref))
return;
DBG_BUGON(__lockref_is_dead(&grp->lockref));
if (grp->lockref.count == 1)
atomic_long_inc(&erofs_global_shrink_cnt);
--grp->lockref.count;
spin_unlock(&grp->lockref.lock);
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
int free = false;
spin_lock(&grp->lockref.lock);
if (grp->lockref.count)
goto out;
/*
* Note that all cached pages should be detached before deleted from
* the XArray. Otherwise some cached pages could be still attached to
* the orphan old workgroup when the new one is available in the tree.
*/
if (erofs_try_to_free_all_cached_folios(sbi, grp))
goto out;
/*
* It's impossible to fail after the workgroup is freezed,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
lockref_mark_dead(&grp->lockref);
free = true;
out:
spin_unlock(&grp->lockref.lock);
if (free)
__erofs_workgroup_free(grp);
return free;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned long nr_shrink)
{
struct erofs_workgroup *grp;
unsigned int freed = 0;
unsigned long index;
xa_lock(&sbi->managed_pslots);
xa_for_each(&sbi->managed_pslots, index, grp) {
/* try to shrink each valid workgroup */
if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
xa_unlock(&sbi->managed_pslots);
++freed;
if (!--nr_shrink)
return freed;
xa_lock(&sbi->managed_pslots);
}
xa_unlock(&sbi->managed_pslots);
return freed;
}
void erofs_shrinker_register(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@ -369,8 +230,8 @@ void erofs_shrinker_unregister(struct super_block *sb)
struct erofs_sb_info *const sbi = EROFS_SB(sb);
mutex_lock(&sbi->umount_mutex);
/* clean up all remaining workgroups in memory */
erofs_shrink_workstation(sbi, ~0UL);
/* clean up all remaining pclusters in memory */
z_erofs_shrink_scan(sbi, ~0UL);
spin_lock(&erofs_sb_list_lock);
list_del(&sbi->list);
@ -418,9 +279,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
freed += erofs_shrink_workstation(sbi, nr - freed);
freed += z_erofs_shrink_scan(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
p = p->next;