From 53566cca34de34dceac16363e97e0faa03d2a425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 26 Apr 2024 18:20:15 +0200 Subject: [PATCH 001/168] fs: rename struct xattr_ctx to kernel_xattr_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the struct xattr_ctx to increase distinction with the about to be added user API struct xattr_args. No functional change. Suggested-by: Christian Brauner Signed-off-by: Christian Göttsche Link: https://lore.kernel.org/r/20240426162042.191916-2-cgoettsche@seltendoof.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/internal.h | 8 ++++---- fs/xattr.c | 10 +++++----- io_uring/xattr.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..2e69e17012a8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -267,7 +267,7 @@ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; -struct xattr_ctx { +struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; @@ -283,11 +283,11 @@ struct xattr_ctx { ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, - struct xattr_ctx *ctx); + struct kernel_xattr_ctx *ctx); -int setxattr_copy(const char __user *name, struct xattr_ctx *ctx); +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx); + struct kernel_xattr_ctx *ctx); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL diff --git a/fs/xattr.c b/fs/xattr.c index 7672ce5486c5..a90215c97c9f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -590,7 +590,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); * Extended attribute SET operations */ -int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) +int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; @@ -620,7 +620,7 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx) } int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct xattr_ctx *ctx) + struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, @@ -635,7 +635,7 @@ static int path_setxattr(const char __user *pathname, size_t size, int flags, unsigned int lookup_flags) { struct xattr_name kname; - struct xattr_ctx ctx = { + struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, @@ -720,7 +720,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, */ ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, - struct xattr_ctx *ctx) + struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; @@ -755,7 +755,7 @@ getxattr(struct mnt_idmap *idmap, struct dentry *d, { ssize_t error; struct xattr_name kname; - struct xattr_ctx ctx = { + struct kernel_xattr_ctx ctx = { .value = value, .kvalue = NULL, .size = size, diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 6cf41c3bc369..5b4594ede935 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -18,7 +18,7 @@ struct io_xattr { struct file *file; - struct xattr_ctx ctx; + struct kernel_xattr_ctx ctx; struct filename *filename; }; From bf9883d5779117776c2e25d29d4dfed31d390fb2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 12:11:21 -0400 Subject: [PATCH 002/168] ufs: Convert ufs_get_page() to use a folio Remove a call to read_mapping_page(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 61f25d3cf3f7..0705848899c1 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -194,18 +194,19 @@ fail: static struct page *ufs_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - kmap(page); - if (unlikely(!PageChecked(page))) { - if (!ufs_check_page(page)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return &folio->page; + kmap(&folio->page); + if (unlikely(!folio_test_checked(folio))) { + if (!ufs_check_page(&folio->page)) + goto fail; } - return page; + return &folio->page; fail: - ufs_put_page(page); + ufs_put_page(&folio->page); return ERR_PTR(-EIO); } From 5fe08b1d7e315cb4238ae1afa4a3ec951ae4908e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 14:46:26 -0400 Subject: [PATCH 003/168] ufs: Convert ufs_get_page() to ufs_get_folio() Use the same calling convention as ext2 (see commit 46022375abe8) so that we can transition to kmap_local in a future patch. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 101 ++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 0705848899c1..6c3235f426ed 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -191,19 +191,22 @@ fail: return false; } -static struct page *ufs_get_page(struct inode *dir, unsigned long n) +static void *ufs_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { struct address_space *mapping = dir->i_mapping; struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *kaddr; if (IS_ERR(folio)) - return &folio->page; - kmap(&folio->page); + return ERR_CAST(folio); + kaddr = kmap(&folio->page); if (unlikely(!folio_test_checked(folio))) { if (!ufs_check_page(&folio->page)) goto fail; } - return &folio->page; + *foliop = folio; + return kaddr; fail: ufs_put_page(&folio->page); @@ -234,14 +237,14 @@ ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) { - struct page *page = ufs_get_page(dir, 0); - struct ufs_dir_entry *de = NULL; + struct folio *folio; + struct ufs_dir_entry *de = ufs_get_folio(dir, 0, &folio); + + if (IS_ERR(de)) + return NULL; + de = ufs_next_entry(dir->i_sb, de); + *p = &folio->page; - if (!IS_ERR(page)) { - de = ufs_next_entry(dir->i_sb, - (struct ufs_dir_entry *)page_address(page)); - *p = page; - } return de; } @@ -262,7 +265,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; + struct folio *folio; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -280,18 +283,17 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, start = 0; n = start; do { - char *kaddr; - page = ufs_get_page(dir, n); - if (!IS_ERR(page)) { - kaddr = page_address(page); - de = (struct ufs_dir_entry *) kaddr; + char *kaddr = ufs_get_folio(dir, n, &folio); + + if (!IS_ERR(kaddr)) { + de = (struct ufs_dir_entry *)kaddr; kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (ufs_match(sb, namelen, name, de)) goto found; de = ufs_next_entry(sb, de); } - ufs_put_page(page); + ufs_put_page(&folio->page); } if (++n >= npages) n = 0; @@ -300,7 +302,7 @@ out: return NULL; found: - *res_page = page; + *res_page = &folio->page; ui->i_dir_start_lookup = n; return de; } @@ -317,11 +319,10 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned reclen = UFS_DIR_REC_LEN(namelen); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; struct ufs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; - char *kaddr; loff_t pos; int err; @@ -329,21 +330,19 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. */ for (n = 0; n <= npages; n++) { + char *kaddr = ufs_get_folio(dir, n, &folio); char *dir_end; - page = ufs_get_page(dir, n); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; - lock_page(page); - kaddr = page_address(page); + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); + folio_lock(folio); dir_end = kaddr + ufs_last_byte(dir, n); de = (struct ufs_dir_entry *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -370,16 +369,15 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; de = (struct ufs_dir_entry *) ((char *) de + rec_len); } - unlock_page(page); - ufs_put_page(page); + folio_unlock(folio); + ufs_put_page(&folio->page); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + - (char*)de - (char*)page_address(page); - err = ufs_prepare_chunk(page, pos, rec_len); + pos = folio_pos(folio) + offset_in_folio(folio, de); + err = ufs_prepare_chunk(&folio->page, pos, rec_len); if (err) goto out_unlock; if (de->d_ino) { @@ -396,18 +394,17 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, rec_len); + ufs_commit_chunk(&folio->page, pos, rec_len); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ufs_put_page(page); -out: + ufs_put_page(&folio->page); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } @@ -445,19 +442,18 @@ ufs_readdir(struct file *file, struct dir_context *ctx) return 0; for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; struct ufs_dir_entry *de; + struct folio *folio; + char *kaddr = ufs_get_folio(inode, n, &folio); + char *limit; - struct page *page = ufs_get_page(inode, n); - - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; - return -EIO; + return PTR_ERR(kaddr); } - kaddr = page_address(page); if (unlikely(need_revalidate)) { if (offset) { offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); @@ -483,13 +479,13 @@ ufs_readdir(struct file *file, struct dir_context *ctx) ufs_get_de_namlen(sb, de), fs32_to_cpu(sb, de->d_ino), d_type)) { - ufs_put_page(page); + ufs_put_page(&folio->page); return 0; } } ctx->pos += fs16_to_cpu(sb, de->d_reclen); } - ufs_put_page(page); + ufs_put_page(&folio->page); } return 0; } @@ -600,18 +596,17 @@ fail: int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; - struct page *page = NULL; + struct folio *folio; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; struct ufs_dir_entry *de; - page = ufs_get_page(inode, i); - if (IS_ERR(page)) + kaddr = ufs_get_folio(inode, i, &folio); + if (IS_ERR(kaddr)) continue; - kaddr = page_address(page); de = (struct ufs_dir_entry *)kaddr; kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1); @@ -638,12 +633,12 @@ int ufs_empty_dir(struct inode * inode) } de = ufs_next_entry(sb, de); } - ufs_put_page(page); + ufs_put_page(&folio->page); } return 1; not_empty: - ufs_put_page(page); + ufs_put_page(&folio->page); return 0; } From a60b0e8f150ff5e9fefaf9166b75381f162da45d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 11:42:51 -0400 Subject: [PATCH 004/168] ufs: Convert ufs_check_page() to ufs_check_folio() Includes large folio support in case we decide to support block size > PAGE_SIZE (as with ext2, this support will be limited to machines without HIGHMEM). Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6c3235f426ed..287cab597cc1 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -112,20 +112,18 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_handle_dirsync(dir); } - -static bool ufs_check_page(struct page *page) +static bool ufs_check_folio(struct folio *folio, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; - char *kaddr = page_address(page); unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned limit = folio_size(folio); const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1; struct ufs_dir_entry *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = offset_in_folio(folio, dir->i_size); if (limit & chunk_mask) goto Ebadsize; if (!limit) @@ -150,13 +148,13 @@ static bool ufs_check_page(struct page *page) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ Ebadsize: - ufs_error(sb, "ufs_check_page", + ufs_error(sb, __func__, "size of directory #%lu is not a multiple of chunk size", dir->i_ino ); @@ -176,17 +174,17 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<i_ino, error, folio_pos(folio) + offs, rec_len, ufs_get_de_namlen(sb, p)); goto fail; Eend: p = (struct ufs_dir_entry *)(kaddr + offs); ufs_error(sb, __func__, "entry in directory #%lu spans the page boundary" - "offset=%lu", - dir->i_ino, (page->index<i_ino, folio_pos(folio) + offs); fail: return false; } @@ -202,7 +200,7 @@ static void *ufs_get_folio(struct inode *dir, unsigned long n, return ERR_CAST(folio); kaddr = kmap(&folio->page); if (unlikely(!folio_test_checked(folio))) { - if (!ufs_check_page(&folio->page)) + if (!ufs_check_folio(folio, kaddr)) goto fail; } *foliop = folio; From e95d2754458a6a0a1c8a5dc1d593d80f6940fbdc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 15:11:00 -0400 Subject: [PATCH 005/168] ufs: Convert ufs_find_entry() to take a folio This matches ext2 and pushes the use of folios out by one layer. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 17 ++++++----------- fs/ufs/namei.c | 22 +++++++++++----------- fs/ufs/ufs.h | 20 +++++++++++--------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 287cab597cc1..fb56a00b622c 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -76,12 +76,12 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; - de = ufs_find_entry(dir, qstr, &page); + de = ufs_find_entry(dir, qstr, &folio); if (de) { res = fs32_to_cpu(dir->i_sb, de->d_ino); - ufs_put_page(page); + ufs_put_page(&folio->page); } return res; } @@ -255,7 +255,7 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) * Entry is guaranteed to be valid. */ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, - struct page **res_page) + struct folio **foliop) { struct super_block *sb = dir->i_sb; const unsigned char *name = qstr->name; @@ -263,7 +263,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct folio *folio; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -272,16 +271,13 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, if (npages == 0 || namelen > UFS_MAXNAMLEN) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ui->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr = ufs_get_folio(dir, n, &folio); + char *kaddr = ufs_get_folio(dir, n, foliop); if (!IS_ERR(kaddr)) { de = (struct ufs_dir_entry *)kaddr; @@ -291,7 +287,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, goto found; de = ufs_next_entry(sb, de); } - ufs_put_page(&folio->page); + ufs_put_page(&(*foliop)->page); } if (++n >= npages) n = 0; @@ -300,7 +296,6 @@ out: return NULL; found: - *res_page = &folio->page; ui->i_dir_start_lookup = n; return de; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 9cad29463791..53e9bfad54df 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -209,14 +209,14 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; - struct page *page; + struct folio *folio; int err = -ENOENT; - de = ufs_find_entry(dir, &dentry->d_name, &page); + de = ufs_find_entry(dir, &dentry->d_name, &folio); if (!de) goto out; - err = ufs_delete_entry(dir, de, page); + err = ufs_delete_entry(dir, de, &folio->page); if (err) goto out; @@ -251,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct ufs_dir_entry * dir_de = NULL; - struct page *old_page; + struct folio *old_folio; struct ufs_dir_entry *old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_de) goto out; @@ -270,7 +270,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct ufs_dir_entry *new_de; err = -ENOTEMPTY; @@ -278,10 +278,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; err = -ENOENT; - new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode, 1); + ufs_set_link(new_dir, new_de, &new_folio->page, old_inode, 1); inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -300,7 +300,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ inode_set_ctime_current(old_inode); - ufs_delete_entry(old_dir, old_de, old_page); + ufs_delete_entry(old_dir, old_de, &old_folio->page); mark_inode_dirty(old_inode); if (dir_de) { @@ -321,8 +321,8 @@ out_dir: put_page(dir_page); } out_old: - kunmap(old_page); - put_page(old_page); + kunmap(&old_folio->page); + folio_put(old_folio); out: return err; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 6b499180643b..161fe0bb6fd1 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -99,15 +99,17 @@ extern void ufs_put_cylinder (struct super_block *, unsigned); /* dir.c */ extern const struct inode_operations ufs_dir_inode_operations; -extern int ufs_add_link (struct dentry *, struct inode *); -extern ino_t ufs_inode_by_name(struct inode *, const struct qstr *); -extern int ufs_make_empty(struct inode *, struct inode *); -extern struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct page **); -extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); -extern int ufs_empty_dir (struct inode *); -extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); -extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, bool update_times); + +int ufs_add_link(struct dentry *, struct inode *); +ino_t ufs_inode_by_name(struct inode *, const struct qstr *); +int ufs_make_empty(struct inode *, struct inode *); +struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, + struct folio **); +int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); +int ufs_empty_dir(struct inode *); +struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); +void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; From 597697c5adf8ac16e8e8a5d55a572a19f2282b88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 15:23:35 -0400 Subject: [PATCH 006/168] ufs: Convert ufs_set_link() and ufss_dotdot() to take a folio This matches ext2 and removes a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 26 +++++++++++--------------- fs/ufs/namei.c | 16 ++++++++-------- fs/ufs/ufs.h | 4 ++-- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index fb56a00b622c..6fcca4dd064a 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -89,23 +89,22 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, + struct folio *folio, struct inode *inode, bool update_times) { - loff_t pos = page_offset(page) + - (char *) de - (char *) page_address(page); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen); int err; - lock_page(page); - err = ufs_prepare_chunk(page, pos, len); + folio_lock(folio); + err = ufs_prepare_chunk(&folio->page, pos, len); BUG_ON(err); de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - ufs_commit_chunk(page, pos, len); - ufs_put_page(page); + ufs_commit_chunk(&folio->page, pos, len); + ufs_put_page(&folio->page); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); @@ -233,17 +232,14 @@ ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p) fs16_to_cpu(sb, p->d_reclen)); } -struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p) +struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct folio **foliop) { - struct folio *folio; - struct ufs_dir_entry *de = ufs_get_folio(dir, 0, &folio); + struct ufs_dir_entry *de = ufs_get_folio(dir, 0, foliop); - if (IS_ERR(de)) - return NULL; - de = ufs_next_entry(dir->i_sb, de); - *p = &folio->page; + if (!IS_ERR(de)) + return ufs_next_entry(dir->i_sb, de); - return de; + return NULL; } /* diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 53e9bfad54df..1759b710d831 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -249,7 +249,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *dir_page = NULL; + struct folio *dir_folio = NULL; struct ufs_dir_entry * dir_de = NULL; struct folio *old_folio; struct ufs_dir_entry *old_de; @@ -264,7 +264,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ufs_dotdot(old_inode, &dir_page); + dir_de = ufs_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } @@ -281,7 +281,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, &new_folio->page, old_inode, 1); + ufs_set_link(new_dir, new_de, new_folio, old_inode, 1); inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); @@ -305,10 +305,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (dir_de) { if (old_dir != new_dir) - ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); else { - kunmap(dir_page); - put_page(dir_page); + kunmap(&dir_folio->page); + folio_put(dir_folio); } inode_dec_link_count(old_dir); } @@ -317,8 +317,8 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) { - kunmap(dir_page); - put_page(dir_page); + kunmap(&dir_folio->page); + folio_put(dir_folio); } out_old: kunmap(&old_folio->page); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 161fe0bb6fd1..1ad992ab2855 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -107,9 +107,9 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct folio **); int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); int ufs_empty_dir(struct inode *); -struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); +struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode, bool update_times); + struct folio *folio, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; From 767bd0af25e59e8b75801847ddc7bf0ab186bf3c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 15:32:53 -0400 Subject: [PATCH 007/168] ufs: Convert ufs_delete_entry() to work on a folio Match ext2 and remove a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 29 ++++++++++++++++------------- fs/ufs/namei.c | 4 ++-- fs/ufs/ufs.h | 2 +- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6fcca4dd064a..945fff87b385 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -485,19 +485,23 @@ ufs_readdir(struct file *file, struct dir_context *ctx) * previous entry. */ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, - struct page * page) + struct folio *folio) { struct super_block *sb = inode->i_sb; - char *kaddr = page_address(page); - unsigned from = ((char*)dir - kaddr) & ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen); + size_t from, to; + char *kaddr; loff_t pos; - struct ufs_dir_entry *pde = NULL; - struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from); + struct ufs_dir_entry *de, *pde = NULL; int err; UFSD("ENTER\n"); + from = offset_in_folio(folio, dir); + to = from + fs16_to_cpu(sb, dir->d_reclen); + kaddr = (char *)dir - from; + from &= ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); + de = (struct ufs_dir_entry *) (kaddr + from); + UFSD("ino %u, reclen %u, namlen %u, name %s\n", fs32_to_cpu(sb, de->d_ino), fs16_to_cpu(sb, de->d_reclen), @@ -514,21 +518,20 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, de = ufs_next_entry(sb, de); } if (pde) - from = (char*)pde - (char*)page_address(page); - - pos = page_offset(page) + from; - lock_page(page); - err = ufs_prepare_chunk(page, pos, to - from); + from = offset_in_folio(folio, pde); + pos = folio_pos(folio) + from; + folio_lock(folio); + err = ufs_prepare_chunk(&folio->page, pos, to - from); BUG_ON(err); if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - ufs_commit_chunk(page, pos, to - from); + ufs_commit_chunk(&folio->page, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: - ufs_put_page(page); + ufs_put_page(&folio->page); UFSD("EXIT\n"); return err; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 1759b710d831..a9b0c15de067 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -216,7 +216,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto out; - err = ufs_delete_entry(dir, de, &folio->page); + err = ufs_delete_entry(dir, de, folio); if (err) goto out; @@ -300,7 +300,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, */ inode_set_ctime_current(old_inode); - ufs_delete_entry(old_dir, old_de, &old_folio->page); + ufs_delete_entry(old_dir, old_de, old_folio); mark_inode_dirty(old_inode); if (dir_de) { diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 1ad992ab2855..a2c762cb65a0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -105,7 +105,7 @@ ino_t ufs_inode_by_name(struct inode *, const struct qstr *); int ufs_make_empty(struct inode *, struct inode *); struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *, struct folio **); -int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *); +int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *); int ufs_empty_dir(struct inode *); struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **); void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, From f4a20e53aba74b406f3becf28cc3776ad38153e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 17:44:48 -0400 Subject: [PATCH 008/168] ufs: Convert ufs_make_empty() to use a folio Removes a few hidden calls to compound_head() and uses kmap_local instead of kmap. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 945fff87b385..6a2a6af38097 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -540,26 +540,25 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) { struct super_block * sb = dir->i_sb; struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct folio *folio = filemap_grab_folio(mapping, 0); const unsigned int chunk_size = UFS_SB(sb)->s_uspi->s_dirblksize; struct ufs_dir_entry * de; - char *base; int err; + char *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = ufs_prepare_chunk(page, 0, chunk_size); + err = ufs_prepare_chunk(&folio->page, 0, chunk_size); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kmap(page); - base = (char*)page_address(page); - memset(base, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); - de = (struct ufs_dir_entry *) base; + de = (struct ufs_dir_entry *)kaddr; de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); @@ -573,12 +572,12 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) de->d_reclen = cpu_to_fs16(sb, chunk_size - UFS_DIR_REC_LEN(1)); ufs_set_de_namlen(sb, de, 2); strcpy (de->d_name, ".."); - kunmap(page); + kunmap_local(kaddr); - ufs_commit_chunk(page, 0, chunk_size); + ufs_commit_chunk(&folio->page, 0, chunk_size); err = ufs_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } From 128d1e89acb978bf6a4ec0d04bbfe8cb801965be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 17:47:06 -0400 Subject: [PATCH 009/168] ufs: Convert ufs_prepare_chunk() to take a folio All callers now have a folio, so convert ufs_prepare_chunk() to take one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 8 ++++---- fs/ufs/inode.c | 4 ++-- fs/ufs/util.h | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6a2a6af38097..a20f66351c66 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -97,7 +97,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, int err; folio_lock(folio); - err = ufs_prepare_chunk(&folio->page, pos, len); + err = ufs_prepare_chunk(folio, pos, len); BUG_ON(err); de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); @@ -366,7 +366,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) got_it: pos = folio_pos(folio) + offset_in_folio(folio, de); - err = ufs_prepare_chunk(&folio->page, pos, rec_len); + err = ufs_prepare_chunk(folio, pos, rec_len); if (err) goto out_unlock; if (de->d_ino) { @@ -521,7 +521,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, from = offset_in_folio(folio, pde); pos = folio_pos(folio) + from; folio_lock(folio); - err = ufs_prepare_chunk(&folio->page, pos, to - from); + err = ufs_prepare_chunk(folio, pos, to - from); BUG_ON(err); if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); @@ -549,7 +549,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) if (IS_ERR(folio)) return PTR_ERR(folio); - err = ufs_prepare_chunk(&folio->page, 0, chunk_size); + err = ufs_prepare_chunk(folio, 0, chunk_size); if (err) { folio_unlock(folio); goto fail; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a7bb2e63cdde..0e608fc0d0fd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -479,9 +479,9 @@ static int ufs_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, ufs_getfrag_block); } -int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, ufs_getfrag_block); + return __block_write_begin(&folio->page, pos, len, ufs_getfrag_block); } static void ufs_truncate_blocks(struct inode *); diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 0ecd2ed792f5..bf708b68f150 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -250,9 +250,9 @@ ufs_set_inode_gid(struct super_block *sb, struct ufs_inode *inode, u32 value) } } -extern dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); -extern void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); -extern int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len); +dev_t ufs_get_inode_dev(struct super_block *, struct ufs_inode_info *); +void ufs_set_inode_dev(struct super_block *, struct ufs_inode_info *, dev_t); +int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* * These functions manipulate ufs buffers From 0f3e63f30bf5ba214561e127a2c21c2d46ded141 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 17:49:11 -0400 Subject: [PATCH 010/168] ufs; Convert ufs_commit_chunk() to take a folio All callers now have a folio, so pass it in. Saves a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index a20f66351c66..71685491d5f6 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -42,18 +42,18 @@ static inline int ufs_match(struct super_block *sb, int len, return !memcmp(name, de->d_name, len); } -static void ufs_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int ufs_handle_dirsync(struct inode *dir) @@ -103,7 +103,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino); ufs_set_de_type(dir->i_sb, de, inode->i_mode); - ufs_commit_chunk(&folio->page, pos, len); + ufs_commit_chunk(folio, pos, len); ufs_put_page(&folio->page); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -383,7 +383,7 @@ got_it: de->d_ino = cpu_to_fs32(sb, inode->i_ino); ufs_set_de_type(sb, de, inode->i_mode); - ufs_commit_chunk(&folio->page, pos, rec_len); + ufs_commit_chunk(folio, pos, rec_len); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); @@ -526,7 +526,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, if (pde) pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; - ufs_commit_chunk(&folio->page, pos, to - from); + ufs_commit_chunk(folio, pos, to - from); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); @@ -574,7 +574,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir) strcpy (de->d_name, ".."); kunmap_local(kaddr); - ufs_commit_chunk(&folio->page, 0, chunk_size); + ufs_commit_chunk(folio, 0, chunk_size); err = ufs_handle_dirsync(inode); fail: folio_put(folio); From 516b97cf03dd630ad90c9329de312c755690650a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 18:01:47 -0400 Subject: [PATCH 011/168] ufs: Convert directory handling to kmap_local Remove kmap use and use folio_release_kmap() instead of ufs_put_page(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 30 ++++++++++++------------------ fs/ufs/namei.c | 15 +++++---------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 71685491d5f6..3b3cd84f1f7f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -66,12 +66,6 @@ static int ufs_handle_dirsync(struct inode *dir) return err; } -static inline void ufs_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -81,7 +75,7 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) de = ufs_find_entry(dir, qstr, &folio); if (de) { res = fs32_to_cpu(dir->i_sb, de->d_ino); - ufs_put_page(&folio->page); + folio_release_kmap(folio, de); } return res; } @@ -104,7 +98,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_set_de_type(dir->i_sb, de, inode->i_mode); ufs_commit_chunk(folio, pos, len); - ufs_put_page(&folio->page); + folio_release_kmap(folio, de); if (update_times) inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); @@ -197,7 +191,7 @@ static void *ufs_get_folio(struct inode *dir, unsigned long n, if (IS_ERR(folio)) return ERR_CAST(folio); - kaddr = kmap(&folio->page); + kaddr = kmap_local_folio(folio, 0); if (unlikely(!folio_test_checked(folio))) { if (!ufs_check_folio(folio, kaddr)) goto fail; @@ -206,7 +200,7 @@ static void *ufs_get_folio(struct inode *dir, unsigned long n, return kaddr; fail: - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); return ERR_PTR(-EIO); } @@ -283,7 +277,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, goto found; de = ufs_next_entry(sb, de); } - ufs_put_page(&(*foliop)->page); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) n = 0; @@ -359,7 +353,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) de = (struct ufs_dir_entry *) ((char *) de + rec_len); } folio_unlock(folio); - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; @@ -390,7 +384,7 @@ got_it: err = ufs_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ufs_put_page(&folio->page); + folio_release_kmap(folio, de); return err; out_unlock: folio_unlock(folio); @@ -468,13 +462,13 @@ ufs_readdir(struct file *file, struct dir_context *ctx) ufs_get_de_namlen(sb, de), fs32_to_cpu(sb, de->d_ino), d_type)) { - ufs_put_page(&folio->page); + folio_release_kmap(folio, de); return 0; } } ctx->pos += fs16_to_cpu(sb, de->d_reclen); } - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -531,7 +525,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); UFSD("EXIT\n"); return err; } @@ -624,12 +618,12 @@ int ufs_empty_dir(struct inode * inode) } de = ufs_next_entry(sb, de); } - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - ufs_put_page(&folio->page); + folio_release_kmap(folio, kaddr); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index a9b0c15de067..24bd12186647 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -306,23 +306,18 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (dir_de) { if (old_dir != new_dir) ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0); - else { - kunmap(&dir_folio->page); - folio_put(dir_folio); - } + else + folio_release_kmap(dir_folio, new_dir); inode_dec_link_count(old_dir); } return 0; out_dir: - if (dir_de) { - kunmap(&dir_folio->page); - folio_put(dir_folio); - } + if (dir_de) + folio_release_kmap(dir_folio, dir_de); out_old: - kunmap(&old_folio->page); - folio_put(old_folio); + folio_release_kmap(old_folio, old_de); out: return err; } From a3b4537f82425f6cb37f299477cb38cc81cbe30a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 09:41:38 -0400 Subject: [PATCH 012/168] sysv: Convert dir_get_page() to dir_get_folio() Remove a few conversions between page and folio. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 80 +++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 2e126d72d619..1f95f82f8941 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -52,20 +52,21 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the + * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * - * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() + * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() * and must be treated accordingly for nesting purposes. */ -static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) +static void *dir_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - *p = page; - return kmap_local_page(page); + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + *foliop = folio; + return kmap_local_folio(folio, 0); } static int sysv_readdir(struct file *file, struct dir_context *ctx) @@ -87,9 +88,9 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct sysv_dir_entry *de; - struct page *page; + struct folio *folio; - kaddr = dir_get_page(inode, n, &page); + kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)(kaddr+offset); @@ -103,11 +104,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -133,7 +134,7 @@ static inline int namecompare(int len, int maxlen, * * On Success unmap_and_put_page() should be called on *res_page. * - * sysv_find_entry() acts as a call to dir_get_page() and must be treated + * sysv_find_entry() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) @@ -143,7 +144,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; + struct folio *folio = NULL; struct sysv_dir_entry *de; *res_page = NULL; @@ -154,7 +155,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ n = start; do { - char *kaddr = dir_get_page(dir, n, &page); + char *kaddr = dir_get_folio(dir, n, &folio); if (!IS_ERR(kaddr)) { de = (struct sysv_dir_entry *)kaddr; @@ -166,7 +167,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } if (++n >= npages) @@ -177,7 +178,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ found: SYSV_I(dir)->i_dir_start_lookup = n; - *res_page = page; + *res_page = &folio->page; return de; } @@ -186,7 +187,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct page *page = NULL; + struct folio *folio = NULL; struct sysv_dir_entry * de; unsigned long npages = dir_pages(dir); unsigned long n; @@ -196,7 +197,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) /* We take care of directory expansion in the same loop */ for (n = 0; n <= npages; n++) { - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); de = (struct sysv_dir_entry *)kaddr; @@ -206,33 +207,33 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto got_it; err = -EEXIST; if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) - goto out_page; + goto out_folio; de++; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(de); - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + pos = folio_pos(folio) + offset_in_folio(folio, de); + folio_lock(folio); + err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); if (err) goto out_unlock; memcpy (de->name, name, namelen); memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); -out_page: - unmap_and_put_page(page, kaddr); +out_folio: + folio_release_kmap(folio, kaddr); return err; out_unlock: - unlock_page(page); - goto out_page; + folio_unlock(folio); + goto out_folio; } int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) @@ -292,19 +293,19 @@ fail: int sysv_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; - struct page *page = NULL; + struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); char *kaddr; for (i = 0; i < npages; i++) { struct sysv_dir_entry *de; - kaddr = dir_get_page(inode, i, &page); + kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE-SYSV_DIRSIZE; + kaddr += folio_size(folio) - SYSV_DIRSIZE; for ( ;(char *)de <= kaddr; de++) { if (!de->inode) @@ -321,12 +322,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } @@ -352,18 +353,21 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the + * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the * rules documented in mm/highmem.rst. * - * sysv_dotdot() acts as a call to dir_get_page() and must be treated + * sysv_dotdot() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) { - struct sysv_dir_entry *de = dir_get_page(dir, 0, p); + struct folio *folio; + + struct sysv_dir_entry *de = dir_get_folio(dir, 0, &folio); if (IS_ERR(de)) return NULL; + *p = &folio->page; /* ".." is the second directory entry */ return de + 1; } From ba36ee530896bb1094f1d86b5456d776c93a5516 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:31:45 -0400 Subject: [PATCH 013/168] sysv: Convert sysv_find_entry() to take a folio Remove a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 25 ++++++++++--------------- fs/sysv/namei.c | 24 ++++++++++++------------ fs/sysv/sysv.h | 16 ++++++++-------- 3 files changed, 30 insertions(+), 35 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 1f95f82f8941..5b2e3c7c2971 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -127,39 +127,35 @@ static inline int namecompare(int len, int maxlen, /* * sysv_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the + * finds an entry in the specified directory with the wanted name. + * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success unmap_and_put_page() should be called on *res_page. + * On Success folio_release_kmap() should be called on *foliop. * * sysv_find_entry() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_page) +struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct folio *folio = NULL; struct sysv_dir_entry *de; - *res_page = NULL; - start = SYSV_I(dir)->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr = dir_get_folio(dir, n, &folio); + char *kaddr = dir_get_folio(dir, n, foliop); if (!IS_ERR(kaddr)) { de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE - SYSV_DIRSIZE; + kaddr += folio_size(*foliop) - SYSV_DIRSIZE; for ( ; (char *) de <= kaddr ; de++) { if (!de->inode) continue; @@ -167,7 +163,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - folio_release_kmap(folio, kaddr); + folio_release_kmap(*foliop, kaddr); } if (++n >= npages) @@ -178,7 +174,6 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ found: SYSV_I(dir)->i_dir_start_lookup = n; - *res_page = &folio->page; return de; } @@ -374,13 +369,13 @@ struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) ino_t sysv_inode_by_name(struct dentry *dentry) { - struct page *page; - struct sysv_dir_entry *de = sysv_find_entry (dentry, &page); + struct folio *folio; + struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); ino_t res = 0; if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index d6b73798071b..970043fe49ee 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -151,20 +151,20 @@ out_dir: static int sysv_unlink(struct inode * dir, struct dentry * dentry) { struct inode * inode = d_inode(dentry); - struct page * page; + struct folio *folio; struct sysv_dir_entry * de; int err; - de = sysv_find_entry(dentry, &page); + de = sysv_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = sysv_delete_entry(de, page); + err = sysv_delete_entry(de, &folio->page); if (!err) { inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); return err; } @@ -196,14 +196,14 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct sysv_dir_entry * dir_de = NULL; - struct page * old_page; + struct folio *old_folio; struct sysv_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = sysv_find_entry(old_dentry, &old_page); + old_de = sysv_find_entry(old_dentry, &old_folio); if (!old_de) goto out; @@ -215,7 +215,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (new_inode) { - struct page * new_page; + struct folio *new_folio; struct sysv_dir_entry * new_de; err = -ENOTEMPTY; @@ -223,11 +223,11 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_dir; err = -ENOENT; - new_de = sysv_find_entry(new_dentry, &new_page); + new_de = sysv_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = sysv_set_link(new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + err = sysv_set_link(new_de, &new_folio->page, old_inode); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); @@ -242,7 +242,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_inc_link_count(new_dir); } - err = sysv_delete_entry(old_de, old_page); + err = sysv_delete_entry(old_de, &old_folio->page); if (err) goto out_dir; @@ -258,7 +258,7 @@ out_dir: if (dir_de) unmap_and_put_page(dir_page, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: return err; } diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index e3f988b469ee..be15c659a027 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -148,15 +148,15 @@ extern void sysv_destroy_icache(void); /* dir.c */ -extern struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct page **); -extern int sysv_add_link(struct dentry *, struct inode *); -extern int sysv_delete_entry(struct sysv_dir_entry *, struct page *); -extern int sysv_make_empty(struct inode *, struct inode *); -extern int sysv_empty_dir(struct inode *); -extern int sysv_set_link(struct sysv_dir_entry *, struct page *, +struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); +int sysv_add_link(struct dentry *, struct inode *); +int sysv_delete_entry(struct sysv_dir_entry *, struct page *); +int sysv_make_empty(struct inode *, struct inode *); +int sysv_empty_dir(struct inode *); +int sysv_set_link(struct sysv_dir_entry *, struct page *, struct inode *); -extern struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); -extern ino_t sysv_inode_by_name(struct dentry *); +struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); +ino_t sysv_inode_by_name(struct dentry *); extern const struct inode_operations sysv_file_inode_operations; From 1cfdaf9708ba7f2ff170723999fcdf87e9afb843 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:42:23 -0400 Subject: [PATCH 014/168] sysv: Convert sysv_set_link() and sysv_dotdot() to take a folio This matches ext2 and removes a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 23 ++++++++++------------- fs/sysv/namei.c | 10 +++++----- fs/sysv/sysv.h | 4 ++-- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 5b2e3c7c2971..ebccf7bb5b69 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -327,21 +327,21 @@ not_empty: } /* Releases the page */ -int sysv_set_link(struct sysv_dir_entry *de, struct page *page, - struct inode *inode) +int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, + struct inode *inode) { - struct inode *dir = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *dir = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + folio_lock(folio); + err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); @@ -354,15 +354,12 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, * sysv_dotdot() acts as a call to dir_get_folio() and must be treated * accordingly for nesting purposes. */ -struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct page **p) +struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) { - struct folio *folio; - - struct sysv_dir_entry *de = dir_get_folio(dir, 0, &folio); + struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); if (IS_ERR(de)) return NULL; - *p = &folio->page; /* ".." is the second directory entry */ return de + 1; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 970043fe49ee..ef4d91431225 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -194,7 +194,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio *dir_folio; struct sysv_dir_entry * dir_de = NULL; struct folio *old_folio; struct sysv_dir_entry * old_de; @@ -209,7 +209,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = sysv_dotdot(old_inode, &dir_page); + dir_de = sysv_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } @@ -226,7 +226,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, new_de = sysv_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = sysv_set_link(new_de, &new_folio->page, old_inode); + err = sysv_set_link(new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); if (err) goto out_dir; @@ -249,14 +249,14 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, mark_inode_dirty(old_inode); if (dir_de) { - err = sysv_set_link(dir_de, dir_page, new_dir); + err = sysv_set_link(dir_de, dir_folio, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); out: diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index be15c659a027..ee90af7dbed9 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -153,9 +153,9 @@ int sysv_add_link(struct dentry *, struct inode *); int sysv_delete_entry(struct sysv_dir_entry *, struct page *); int sysv_make_empty(struct inode *, struct inode *); int sysv_empty_dir(struct inode *); -int sysv_set_link(struct sysv_dir_entry *, struct page *, +int sysv_set_link(struct sysv_dir_entry *, struct folio *, struct inode *); -struct sysv_dir_entry *sysv_dotdot(struct inode *, struct page **); +struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); ino_t sysv_inode_by_name(struct dentry *); From 9b1cf7790e5ae631839e06b1f96a9d05e088b450 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:45:23 -0400 Subject: [PATCH 015/168] sysv: Convert sysv_delete_entry() to work on a folio Match ext2 and remove a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 14 +++++++------- fs/sysv/namei.c | 4 ++-- fs/sysv/sysv.h | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index ebccf7bb5b69..0b5727510bdd 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -231,20 +231,20 @@ out_unlock: goto out_folio; } -int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) +int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) { - struct inode *inode = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *inode = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE); + folio_lock(folio); + err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } de->inode = 0; - dir_commit_chunk(page, pos, SYSV_DIRSIZE); + dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index ef4d91431225..fb8bd8437872 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -159,7 +159,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) if (!de) return -ENOENT; - err = sysv_delete_entry(de, &folio->page); + err = sysv_delete_entry(de, folio); if (!err) { inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); @@ -242,7 +242,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_inc_link_count(new_dir); } - err = sysv_delete_entry(old_de, &old_folio->page); + err = sysv_delete_entry(old_de, old_folio); if (err) goto out_dir; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index ee90af7dbed9..fec9f6b883d5 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -150,7 +150,7 @@ extern void sysv_destroy_icache(void); /* dir.c */ struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); int sysv_add_link(struct dentry *, struct inode *); -int sysv_delete_entry(struct sysv_dir_entry *, struct page *); +int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); int sysv_make_empty(struct inode *, struct inode *); int sysv_empty_dir(struct inode *); int sysv_set_link(struct sysv_dir_entry *, struct folio *, From 7f4fb150631bc2b1de11938f9fc9f8c60abae591 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:49:54 -0400 Subject: [PATCH 016/168] sysv: Convert sysv_make_empty() to use a folio Removes a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 0b5727510bdd..5f91a82a2966 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -252,33 +252,33 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) int sysv_make_empty(struct inode *inode, struct inode *dir) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct sysv_dir_entry * de; - char *base; + char *kaddr; int err; - if (!page) - return -ENOMEM; - err = sysv_prepare_chunk(page, 0, 2 * SYSV_DIRSIZE); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = sysv_prepare_chunk(&folio->page, 0, 2 * SYSV_DIRSIZE); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - base = kmap_local_page(page); - memset(base, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); - de = (struct sysv_dir_entry *) base; + de = (struct sysv_dir_entry *)kaddr; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); strcpy(de->name,"."); de++; de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); strcpy(de->name,".."); - kunmap_local(base); - dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE); + kunmap_local(kaddr); + dir_commit_chunk(&folio->page, 0, 2 * SYSV_DIRSIZE); err = sysv_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } From d3a2fa28fe516cf9621ffdafde35308bebc4714c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:52:44 -0400 Subject: [PATCH 017/168] sysv: Convert sysv_prepare_chunk() to take a folio All callers now have a folio, so convert sysv_prepare_chunk() to take one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 8 ++++---- fs/sysv/itree.c | 4 ++-- fs/sysv/sysv.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 5f91a82a2966..43615b803fee 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -213,7 +213,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) got_it: pos = folio_pos(folio) + offset_in_folio(folio, de); folio_lock(folio); - err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -238,7 +238,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) int err; folio_lock(folio); - err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { folio_unlock(folio); return err; @@ -259,7 +259,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) if (IS_ERR(folio)) return PTR_ERR(folio); - err = sysv_prepare_chunk(&folio->page, 0, 2 * SYSV_DIRSIZE); + err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); if (err) { folio_unlock(folio); goto fail; @@ -335,7 +335,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, int err; folio_lock(folio); - err = sysv_prepare_chunk(&folio->page, pos, SYSV_DIRSIZE); + err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); if (err) { folio_unlock(folio); return err; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 19bcb51a2203..c8511e286673 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -466,9 +466,9 @@ static int sysv_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, get_block); } -int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, get_block); + return __block_write_begin(&folio->page, pos, len, get_block); } static void sysv_write_failed(struct address_space *mapping, loff_t to) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index fec9f6b883d5..0a48b2e7edb1 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -133,8 +133,8 @@ extern void sysv_free_block(struct super_block *, sysv_zone_t); extern unsigned long sysv_count_free_blocks(struct super_block *); /* itree.c */ -extern void sysv_truncate(struct inode *); -extern int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len); +void sysv_truncate(struct inode *); +int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); /* inode.c */ extern struct inode *sysv_iget(struct super_block *, unsigned int); From 00753fb5f2406fb852e79d9c6e25328bab353388 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 10:54:22 -0400 Subject: [PATCH 018/168] sysv: Convert dir_commit_chunk() to take a folio All callers now have a folio, so pass it in. Saves a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/sysv/dir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 43615b803fee..27eaa5273ba7 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -28,17 +28,17 @@ const struct file_operations sysv_dir_operations = { .fsync = generic_file_fsync, }; -static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int sysv_handle_dirsync(struct inode *dir) @@ -219,7 +219,7 @@ got_it: memcpy (de->name, name, namelen); memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); @@ -244,7 +244,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) return err; } de->inode = 0; - dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); @@ -275,7 +275,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir) strcpy(de->name,".."); kunmap_local(kaddr); - dir_commit_chunk(&folio->page, 0, 2 * SYSV_DIRSIZE); + dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); err = sysv_handle_dirsync(inode); fail: folio_put(folio); @@ -341,7 +341,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, return err; } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(&folio->page, pos, SYSV_DIRSIZE); + dir_commit_chunk(folio, pos, SYSV_DIRSIZE); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); From b2aa61556fcfa83639507d329d48cd6e2f7ac487 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 23:08:41 -0400 Subject: [PATCH 019/168] qnx6: Convert qnx6_get_page() to qnx6_get_folio() Match the ext2 calling convention by returning the address and setting the folio return pointer. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/dir.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index c1cfb8a19e9d..bc7f20dda579 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -24,13 +24,15 @@ static unsigned qnx6_lfile_checksum(char *name, unsigned size) return crc; } -static struct page *qnx6_get_page(struct inode *dir, unsigned long n) +static void *qnx6_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) - kmap(page); - return page; + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return folio; + *foliop = folio; + return kmap(&folio->page); } static unsigned last_entry(struct inode *inode, unsigned long page_nr) @@ -117,26 +119,27 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_SHIFT; - unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; + unsigned offset = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE; bool done = false; ctx->pos = pos; if (ctx->pos >= inode->i_size) return 0; - for ( ; !done && n < npages; n++, start = 0) { - struct page *page = qnx6_get_page(inode, n); - int limit = last_entry(inode, n); + for ( ; !done && n < npages; n++, offset = 0) { struct qnx6_dir_entry *de; - int i = start; + struct folio *folio; + char *kaddr = qnx6_get_folio(inode, n, &folio); + char *limit; - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_SHIFT; - return PTR_ERR(page); + return PTR_ERR(kaddr); } - de = ((struct qnx6_dir_entry *)page_address(page)) + start; - for (; i < limit; i++, de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { + de = (struct qnx6_dir_entry *)(kaddr + offset); + limit = kaddr + last_entry(inode, n); + for (; (char *)de < limit; de++, ctx->pos += QNX6_DIR_ENTRY_SIZE) { int size = de->de_size; u32 no_inode = fs32_to_cpu(sbi, de->de_inode); @@ -164,7 +167,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) } } } - qnx6_put_page(page); + qnx6_put_page(&folio->page); } return 0; } @@ -215,7 +218,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, { struct super_block *s = dir->i_sb; struct qnx6_inode_info *ei = QNX6_I(dir); - struct page *page = NULL; + struct folio *folio; unsigned long start, n; unsigned long npages = dir_pages(dir); unsigned ino; @@ -232,12 +235,11 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, n = start; do { - page = qnx6_get_page(dir, n); - if (!IS_ERR(page)) { + de = qnx6_get_folio(dir, n, &folio); + if (!IS_ERR(de)) { int limit = last_entry(dir, n); int i; - de = (struct qnx6_dir_entry *)page_address(page); for (i = 0; i < limit; i++, de++) { if (len <= QNX6_SHORT_NAME_MAX) { /* short filename */ @@ -256,7 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, } else pr_err("undefined filename size in inode.\n"); } - qnx6_put_page(page); + qnx6_put_page(&folio->page); } if (++n >= npages) @@ -265,7 +267,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, return 0; found: - *res_page = page; + *res_page = &folio->page; ei->i_dir_start_lookup = n; return ino; } From 5563040e852994c4ad40b189d3ea4ed253ace232 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 23:13:18 -0400 Subject: [PATCH 020/168] qnx6: Convert qnx6_find_entry() to qnx6_find_ino() It's hard to return a directory entry from qnx6_find_entry() because it might be a long dir_entry with a different format. So stick with the convention of returning an inode number, but rename it to qnx6_find_ino() to reflect what it actually does, and move the call to qnx6_put_page() inside the function which lets us get rid of the res_page parameter. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/dir.c | 7 ++----- fs/qnx6/namei.c | 4 +--- fs/qnx6/qnx6.h | 3 +-- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index bc7f20dda579..daf9a335ba22 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -213,8 +213,7 @@ static unsigned qnx6_match(struct super_block *s, int len, const char *name, } -unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page) +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name) { struct super_block *s = dir->i_sb; struct qnx6_inode_info *ei = QNX6_I(dir); @@ -225,8 +224,6 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, struct qnx6_dir_entry *de; struct qnx6_long_dir_entry *lde; - *res_page = NULL; - if (npages == 0) return 0; start = ei->i_dir_start_lookup; @@ -267,8 +264,8 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, return 0; found: - *res_page = &folio->page; ei->i_dir_start_lookup = n; + qnx6_put_page(&folio->page); return ino; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index e2e98e653b8d..0f0755a9ecb5 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -17,7 +17,6 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned ino; - struct page *page; struct inode *foundinode = NULL; const char *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -25,10 +24,9 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, if (len > QNX6_LONG_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - ino = qnx6_find_entry(len, dir, name, &page); + ino = qnx6_find_ino(len, dir, name); if (ino) { foundinode = qnx6_iget(dir->i_sb, ino); - qnx6_put_page(page); if (IS_ERR(foundinode)) pr_debug("lookup->iget -> error %ld\n", PTR_ERR(foundinode)); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 34a6b126a3a9..43da5f91c3ff 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -132,5 +132,4 @@ static inline void qnx6_put_page(struct page *page) put_page(page); } -extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, - struct page **res_page); +unsigned qnx6_find_ino(int len, struct inode *dir, const char *name); From b2f2454c3662ebdfe8b3ebe6fc247f2daf384bf8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 23:22:15 -0400 Subject: [PATCH 021/168] qnx6: Convert qnx6_longname() to take a folio Removes a conversion from folio to page. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/dir.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index daf9a335ba22..0de2a047a174 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -46,19 +46,20 @@ static unsigned last_entry(struct inode *inode, unsigned long page_nr) static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, struct qnx6_long_dir_entry *de, - struct page **p) + struct folio **foliop) { struct qnx6_sb_info *sbi = QNX6_SB(sb); u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */ u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */ - /* within page */ - u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK; + u32 offs; struct address_space *mapping = sbi->longfile->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - kmap(*p = page); - return (struct qnx6_long_filename *)(page_address(page) + offs); + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + offs = offset_in_folio(folio, s << sb->s_blocksize_bits); + *foliop = folio; + return kmap(&folio->page) + offs; } static int qnx6_dir_longfilename(struct inode *inode, @@ -69,7 +70,7 @@ static int qnx6_dir_longfilename(struct inode *inode, struct qnx6_long_filename *lf; struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int lf_size; if (de->de_size != 0xff) { @@ -78,7 +79,7 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } - lf = qnx6_longname(s, de, &page); + lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) { pr_err("Error reading longname\n"); return 0; @@ -89,7 +90,7 @@ static int qnx6_dir_longfilename(struct inode *inode, if (lf_size > QNX6_LONG_NAME_MAX) { pr_debug("file %s\n", lf->lf_fname); pr_err("Filename too long (%i)\n", lf_size); - qnx6_put_page(page); + qnx6_put_page(&folio->page); return 0; } @@ -102,11 +103,11 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_debug("qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { - qnx6_put_page(page); + qnx6_put_page(&folio->page); return 0; } - qnx6_put_page(page); + qnx6_put_page(&folio->page); /* success */ return 1; } @@ -180,23 +181,23 @@ static unsigned qnx6_long_match(int len, const char *name, { struct super_block *s = dir->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - struct page *page; + struct folio *folio; int thislen; - struct qnx6_long_filename *lf = qnx6_longname(s, de, &page); + struct qnx6_long_filename *lf = qnx6_longname(s, de, &folio); if (IS_ERR(lf)) return 0; thislen = fs16_to_cpu(sbi, lf->lf_size); if (len != thislen) { - qnx6_put_page(page); + qnx6_put_page(&folio->page); return 0; } if (memcmp(name, lf->lf_fname, len) == 0) { - qnx6_put_page(page); + qnx6_put_page(&folio->page); return fs32_to_cpu(sbi, de->de_inode); } - qnx6_put_page(page); + qnx6_put_page(&folio->page); return 0; } From 29c42e8b8a7813479961f868fb4189295943c07d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 08:22:12 -0400 Subject: [PATCH 022/168] qnx6: Convert qnx6_checkroot() to use a folio Removes a use of kmap and removes a conversion from folio to page. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 4f1735b882b1..6adee850278e 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -184,17 +184,17 @@ static const char *qnx6_checkroot(struct super_block *s) struct qnx6_dir_entry *dir_entry; struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; - struct page *page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) + struct folio *folio = read_mapping_folio(mapping, 0, NULL); + + if (IS_ERR(folio)) return "error reading root directory"; - kmap(page); - dir_entry = page_address(page); + dir_entry = kmap_local_folio(folio, 0); for (i = 0; i < 2; i++) { /* maximum 3 bytes - due to match_root limitation */ if (strncmp(dir_entry[i].de_fname, match_root[i], 3)) error = 1; } - qnx6_put_page(page); + folio_release_kmap(folio, dir_entry); if (error) return "error reading root directory."; return NULL; From 1d49228f2762740be986886286f7683c6ae30f6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 08:30:23 -0400 Subject: [PATCH 023/168] qnx6: Convert qnx6_iget() to use a folio Removes a use of kmap() and a couple of conversions between folios and pages. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 6adee850278e..85925ec0051a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -518,7 +518,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) struct inode *inode; struct qnx6_inode_info *ei; struct address_space *mapping; - struct page *page; + struct folio *folio; u32 n, offs; inode = iget_locked(sb, ino); @@ -538,17 +538,16 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) return ERR_PTR(-EIO); } n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS); - offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS); mapping = sbi->inodes->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) { + folio = read_mapping_folio(mapping, n, NULL); + if (IS_ERR(folio)) { pr_err("major problem: unable to read inode from dev %s\n", sb->s_id); iget_failed(inode); - return ERR_CAST(page); + return ERR_CAST(folio); } - kmap(page); - raw_inode = ((struct qnx6_inode_entry *)page_address(page)) + offs; + offs = offset_in_folio(folio, (ino - 1) << QNX6_INODE_SIZE_BITS); + raw_inode = kmap_local_folio(folio, offs); inode->i_mode = fs16_to_cpu(sbi, raw_inode->di_mode); i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); @@ -578,7 +577,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); - qnx6_put_page(page); + folio_release_kmap(folio, raw_inode); unlock_new_inode(inode); return inode; } From 25689405bc33788de71d39d9900f849aa8bd9040 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 11:31:09 -0400 Subject: [PATCH 024/168] qnx6: Convert directory handling to use kmap_local Eliminates qnx6_put_page() and a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/qnx6/dir.c | 22 +++++++++++----------- fs/qnx6/qnx6.h | 6 ------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 0de2a047a174..b4d10e45f2e4 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,7 +32,7 @@ static void *qnx6_get_folio(struct inode *dir, unsigned long n, if (IS_ERR(folio)) return folio; *foliop = folio; - return kmap(&folio->page); + return kmap_local_folio(folio, 0); } static unsigned last_entry(struct inode *inode, unsigned long page_nr) @@ -59,7 +59,7 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb, return ERR_CAST(folio); offs = offset_in_folio(folio, s << sb->s_blocksize_bits); *foliop = folio; - return kmap(&folio->page) + offs; + return kmap_local_folio(folio, offs); } static int qnx6_dir_longfilename(struct inode *inode, @@ -90,7 +90,7 @@ static int qnx6_dir_longfilename(struct inode *inode, if (lf_size > QNX6_LONG_NAME_MAX) { pr_debug("file %s\n", lf->lf_fname); pr_err("Filename too long (%i)\n", lf_size); - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); return 0; } @@ -103,11 +103,11 @@ static int qnx6_dir_longfilename(struct inode *inode, pr_debug("qnx6_readdir:%.*s inode:%u\n", lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); return 0; } - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); /* success */ return 1; } @@ -168,7 +168,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) } } } - qnx6_put_page(&folio->page); + folio_release_kmap(folio, kaddr); } return 0; } @@ -190,14 +190,14 @@ static unsigned qnx6_long_match(int len, const char *name, thislen = fs16_to_cpu(sbi, lf->lf_size); if (len != thislen) { - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); return 0; } if (memcmp(name, lf->lf_fname, len) == 0) { - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); return fs32_to_cpu(sbi, de->de_inode); } - qnx6_put_page(&folio->page); + folio_release_kmap(folio, lf); return 0; } @@ -256,7 +256,7 @@ unsigned qnx6_find_ino(int len, struct inode *dir, const char *name) } else pr_err("undefined filename size in inode.\n"); } - qnx6_put_page(&folio->page); + folio_release_kmap(folio, de - i); } if (++n >= npages) @@ -266,7 +266,7 @@ unsigned qnx6_find_ino(int len, struct inode *dir, const char *name) found: ei->i_dir_start_lookup = n; - qnx6_put_page(&folio->page); + folio_release_kmap(folio, de); return ino; } diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index 43da5f91c3ff..56ed1367499e 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -126,10 +126,4 @@ static inline __fs16 cpu_to_fs16(struct qnx6_sb_info *sbi, __u16 n) extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent); -static inline void qnx6_put_page(struct page *page) -{ - kunmap(page); - put_page(page); -} - unsigned qnx6_find_ino(int len, struct inode *dir, const char *name); From 9cf2de3ddaeb2d70ea21448104cc604199da2acc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 15:05:39 -0400 Subject: [PATCH 025/168] minixfs: Convert dir_get_page() to dir_get_folio() Remove a few conversions between page and folio. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 66 ++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index a224cf222570..41e6c0c2e243 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -64,14 +64,15 @@ static int minix_handle_dirsync(struct inode *dir) return err; } -static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p) +static void *dir_get_folio(struct inode *dir, unsigned long n, + struct folio **foliop) { - struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); - *p = page; - return kmap_local_page(page); + struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); + + if (IS_ERR(folio)) + return ERR_CAST(folio); + *foliop = folio; + return kmap_local_folio(folio, 0); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) @@ -99,9 +100,9 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; - struct page *page; + struct folio *folio; - kaddr = dir_get_page(inode, n, &page); + kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; p = kaddr+offset; @@ -122,13 +123,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx) unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { - unmap_and_put_page(page, p); + folio_release_kmap(folio, p); return 0; } } ctx->pos += chunk_size; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -158,7 +159,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; + struct folio *folio = NULL; char *p; char *namx; @@ -168,7 +169,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) for (n = 0; n < npages; n++) { char *kaddr, *limit; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) continue; @@ -188,12 +189,12 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return NULL; found: - *res_page = page; + *res_page = &folio->page; return (minix_dirent *)p; } @@ -204,7 +205,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); - struct page *page = NULL; + struct folio *folio = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr, *p; @@ -223,10 +224,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *limit, *dir_end; - kaddr = dir_get_page(dir, n, &page); + kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); - lock_page(page); + folio_lock(folio); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { @@ -253,15 +254,15 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; } - unlock_page(page); - unmap_and_put_page(page, kaddr); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(p); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + pos = folio_pos(folio) + offset_in_folio(folio, p); + err = minix_prepare_chunk(&folio->page, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); @@ -272,15 +273,15 @@ got_it: memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(&folio->page, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } @@ -357,7 +358,7 @@ fail: */ int minix_empty_dir(struct inode * inode) { - struct page *page = NULL; + struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *name, *kaddr; @@ -366,7 +367,7 @@ int minix_empty_dir(struct inode * inode) for (i = 0; i < npages; i++) { char *p, *limit; - kaddr = dir_get_page(inode, i, &page); + kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; @@ -395,12 +396,12 @@ int minix_empty_dir(struct inode * inode) goto not_empty; } } - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - unmap_and_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } @@ -431,11 +432,14 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) { + struct folio *folio; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = dir_get_page(dir, 0, p); + struct minix_dir_entry *de = dir_get_folio(dir, 0, &folio); - if (!IS_ERR(de)) + if (!IS_ERR(de)) { + *p = &folio->page; return minix_next_entry(de, sbi); + } return NULL; } From 5a77670ff8634f12636db6b5fd0a4ec5d548ef64 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 16:09:32 -0400 Subject: [PATCH 026/168] minixfs: Convert minix_find_entry() to take a folio Remove a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 25 +++++++++++-------------- fs/minix/minix.h | 14 +++++++------- fs/minix/namei.c | 24 ++++++++++++------------ 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 41e6c0c2e243..3bbfac32d520 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -145,12 +145,13 @@ static inline int namecompare(int len, int maxlen, /* * minix_find_entry() * - * finds an entry in the specified directory with the wanted name. It - * returns the cache buffer in which the entry was found, and the entry - * itself (as a parameter - res_dir). It does NOT read the inode of the + * finds an entry in the specified directory with the wanted name. + * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. + * + * On Success folio_release_kmap() should be called on *foliop. */ -minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) +minix_dirent *minix_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -159,17 +160,15 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); - struct folio *folio = NULL; char *p; char *namx; __u32 inumber; - *res_page = NULL; for (n = 0; n < npages; n++) { char *kaddr, *limit; - kaddr = dir_get_folio(dir, n, &folio); + kaddr = dir_get_folio(dir, n, foliop); if (IS_ERR(kaddr)) continue; @@ -189,12 +188,11 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } - folio_release_kmap(folio, kaddr); + folio_release_kmap(*foliop, kaddr); } return NULL; found: - *res_page = &folio->page; return (minix_dirent *)p; } @@ -445,20 +443,19 @@ struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) ino_t minix_inode_by_name(struct dentry *dentry) { - struct page *page; - struct minix_dir_entry *de = minix_find_entry(dentry, &page); + struct folio *folio; + struct minix_dir_entry *de = minix_find_entry(dentry, &folio); ino_t res = 0; if (de) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(inode->i_sb); if (sbi->s_version == MINIX_V3) res = ((minix3_dirent *) de)->inode; else res = de->inode; - unmap_and_put_page(page, de); + folio_release_kmap(folio, de); } return res; } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index d493507c064f..a290dd483e69 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -64,15 +64,15 @@ extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); extern unsigned V1_minix_blocks(loff_t, struct super_block *); extern unsigned V2_minix_blocks(loff_t, struct super_block *); -extern struct minix_dir_entry *minix_find_entry(struct dentry*, struct page**); -extern int minix_add_link(struct dentry*, struct inode*); -extern int minix_delete_entry(struct minix_dir_entry*, struct page*); -extern int minix_make_empty(struct inode*, struct inode*); -extern int minix_empty_dir(struct inode*); +struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **); +int minix_add_link(struct dentry*, struct inode*); +int minix_delete_entry(struct minix_dir_entry*, struct page*); +int minix_make_empty(struct inode*, struct inode*); +int minix_empty_dir(struct inode*); int minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode); -extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); -extern ino_t minix_inode_by_name(struct dentry*); +struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); +ino_t minix_inode_by_name(struct dentry*); extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index a944a0f17b53..117264877bd7 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -141,15 +141,15 @@ out_fail: static int minix_unlink(struct inode * dir, struct dentry *dentry) { struct inode * inode = d_inode(dentry); - struct page * page; + struct folio *folio; struct minix_dir_entry * de; int err; - de = minix_find_entry(dentry, &page); + de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = minix_delete_entry(de, page); - unmap_and_put_page(page, de); + err = minix_delete_entry(de, &folio->page); + folio_release_kmap(folio, de); if (err) return err; @@ -182,14 +182,14 @@ static int minix_rename(struct mnt_idmap *idmap, struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; - struct page * old_page; + struct folio *old_folio; struct minix_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; - old_de = minix_find_entry(old_dentry, &old_page); + old_de = minix_find_entry(old_dentry, &old_folio); if (!old_de) goto out; @@ -201,7 +201,7 @@ static int minix_rename(struct mnt_idmap *idmap, } if (new_inode) { - struct page * new_page; + struct folio *new_folio; struct minix_dir_entry * new_de; err = -ENOTEMPTY; @@ -209,11 +209,11 @@ static int minix_rename(struct mnt_idmap *idmap, goto out_dir; err = -ENOENT; - new_de = minix_find_entry(new_dentry, &new_page); + new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = minix_set_link(new_de, new_page, old_inode); - unmap_and_put_page(new_page, new_de); + err = minix_set_link(new_de, &new_folio->page, old_inode); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); @@ -228,7 +228,7 @@ static int minix_rename(struct mnt_idmap *idmap, inode_inc_link_count(new_dir); } - err = minix_delete_entry(old_de, old_page); + err = minix_delete_entry(old_de, &old_folio->page); if (err) goto out_dir; @@ -243,7 +243,7 @@ out_dir: if (dir_de) unmap_and_put_page(dir_page, dir_de); out_old: - unmap_and_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); out: return err; } From 6e9ead1ec9f8067178d8db005470242bc5375e7f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 20:37:59 -0400 Subject: [PATCH 027/168] minixfs: Convert minix_set_link() and minix_dotdot() to take a folio This matches ext2 and removes a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 23 ++++++++++------------- fs/minix/minix.h | 4 ++-- fs/minix/namei.c | 10 +++++----- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 3bbfac32d520..c32d182c2d74 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -404,40 +404,37 @@ not_empty: } /* Releases the page */ -int minix_set_link(struct minix_dir_entry *de, struct page *page, +int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - loff_t pos = page_offset(page) + offset_in_page(de); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, sbi->s_dirsize); + folio_lock(folio); + err = minix_prepare_chunk(&folio->page, pos, sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; - dir_commit_chunk(page, pos, sbi->s_dirsize); + dir_commit_chunk(&folio->page, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } -struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) +struct minix_dir_entry *minix_dotdot(struct inode *dir, struct folio **foliop) { - struct folio *folio; struct minix_sb_info *sbi = minix_sb(dir->i_sb); - struct minix_dir_entry *de = dir_get_folio(dir, 0, &folio); + struct minix_dir_entry *de = dir_get_folio(dir, 0, foliop); - if (!IS_ERR(de)) { - *p = &folio->page; + if (!IS_ERR(de)) return minix_next_entry(de, sbi); - } return NULL; } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index a290dd483e69..6ed34209ed33 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -69,9 +69,9 @@ int minix_add_link(struct dentry*, struct inode*); int minix_delete_entry(struct minix_dir_entry*, struct page*); int minix_make_empty(struct inode*, struct inode*); int minix_empty_dir(struct inode*); -int minix_set_link(struct minix_dir_entry *de, struct page *page, +int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode); -struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); +struct minix_dir_entry *minix_dotdot(struct inode*, struct folio **); ino_t minix_inode_by_name(struct dentry*); extern const struct inode_operations minix_file_inode_operations; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 117264877bd7..ba82fa3332f1 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -180,7 +180,7 @@ static int minix_rename(struct mnt_idmap *idmap, { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio * dir_folio = NULL; struct minix_dir_entry * dir_de = NULL; struct folio *old_folio; struct minix_dir_entry * old_de; @@ -195,7 +195,7 @@ static int minix_rename(struct mnt_idmap *idmap, if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = minix_dotdot(old_inode, &dir_page); + dir_de = minix_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } @@ -212,7 +212,7 @@ static int minix_rename(struct mnt_idmap *idmap, new_de = minix_find_entry(new_dentry, &new_folio); if (!new_de) goto out_dir; - err = minix_set_link(new_de, &new_folio->page, old_inode); + err = minix_set_link(new_de, new_folio, old_inode); folio_release_kmap(new_folio, new_de); if (err) goto out_dir; @@ -235,13 +235,13 @@ static int minix_rename(struct mnt_idmap *idmap, mark_inode_dirty(old_inode); if (dir_de) { - err = minix_set_link(dir_de, dir_page, new_dir); + err = minix_set_link(dir_de, dir_folio, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) - unmap_and_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: folio_release_kmap(old_folio, old_de); out: From e033fe609d749749ba40554f6c312f9b29b28447 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 20:40:11 -0400 Subject: [PATCH 028/168] minixfs: Convert minix_delete_entry() to work on a folio Match ext2 and remove a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 14 +++++++------- fs/minix/minix.h | 2 +- fs/minix/namei.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index c32d182c2d74..994bbbd3dea2 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -283,25 +283,25 @@ out_unlock: goto out_put; } -int minix_delete_entry(struct minix_dir_entry *de, struct page *page) +int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) { - struct inode *inode = page->mapping->host; - loff_t pos = page_offset(page) + offset_in_page(de); + struct inode *inode = folio->mapping->host; + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; - lock_page(page); - err = minix_prepare_chunk(page, pos, len); + folio_lock(folio); + err = minix_prepare_chunk(&folio->page, pos, len); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; - dir_commit_chunk(page, pos, len); + dir_commit_chunk(&folio->page, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 6ed34209ed33..063bab8faa6b 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -66,7 +66,7 @@ extern unsigned V2_minix_blocks(loff_t, struct super_block *); struct minix_dir_entry *minix_find_entry(struct dentry *, struct folio **); int minix_add_link(struct dentry*, struct inode*); -int minix_delete_entry(struct minix_dir_entry*, struct page*); +int minix_delete_entry(struct minix_dir_entry *, struct folio *); int minix_make_empty(struct inode*, struct inode*); int minix_empty_dir(struct inode*); int minix_set_link(struct minix_dir_entry *de, struct folio *folio, diff --git a/fs/minix/namei.c b/fs/minix/namei.c index ba82fa3332f1..5d9c1406fe27 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -148,7 +148,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) de = minix_find_entry(dentry, &folio); if (!de) return -ENOENT; - err = minix_delete_entry(de, &folio->page); + err = minix_delete_entry(de, folio); folio_release_kmap(folio, de); if (err) @@ -228,7 +228,7 @@ static int minix_rename(struct mnt_idmap *idmap, inode_inc_link_count(new_dir); } - err = minix_delete_entry(old_de, &old_folio->page); + err = minix_delete_entry(old_de, old_folio); if (err) goto out_dir; From da2c04c15068adaf4a5cce0f47fd7aad97ac8c4f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 20:42:38 -0400 Subject: [PATCH 029/168] minixfs: Convert minix_make_empty() to use a folio Removes a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 994bbbd3dea2..15b3ef1e473c 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -309,21 +309,21 @@ int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) int minix_make_empty(struct inode *inode, struct inode *dir) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; - if (!page) - return -ENOMEM; - err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); + if (IS_ERR(folio)) + return PTR_ERR(folio); + err = minix_prepare_chunk(&folio->page, 0, 2 * sbi->s_dirsize); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); + memset(kaddr, 0, folio_size(folio)); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; @@ -344,10 +344,10 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kunmap_local(kaddr); - dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); + dir_commit_chunk(&folio->page, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } From cf04e47128afd071bcb9d0c9806dce835f85356d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 20:59:00 -0400 Subject: [PATCH 030/168] minixfs: Convert minix_prepare_chunk() to take a folio All callers now have a folio, so convert minix_prepare_chunk() to take one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 8 ++++---- fs/minix/inode.c | 4 ++-- fs/minix/minix.h | 24 ++++++++++++------------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 15b3ef1e473c..26bfea508028 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -260,7 +260,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) got_it: pos = folio_pos(folio) + offset_in_folio(folio, p); - err = minix_prepare_chunk(&folio->page, pos, sbi->s_dirsize); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); @@ -292,7 +292,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) int err; folio_lock(folio); - err = minix_prepare_chunk(&folio->page, pos, len); + err = minix_prepare_chunk(folio, pos, len); if (err) { folio_unlock(folio); return err; @@ -316,7 +316,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) if (IS_ERR(folio)) return PTR_ERR(folio); - err = minix_prepare_chunk(&folio->page, 0, 2 * sbi->s_dirsize); + err = minix_prepare_chunk(folio, 0, 2 * sbi->s_dirsize); if (err) { folio_unlock(folio); goto fail; @@ -413,7 +413,7 @@ int minix_set_link(struct minix_dir_entry *de, struct folio *folio, int err; folio_lock(folio); - err = minix_prepare_chunk(&folio->page, pos, sbi->s_dirsize); + err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) { folio_unlock(folio); return err; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1c3df63162ef..0002337977e0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -427,9 +427,9 @@ static int minix_read_folio(struct file *file, struct folio *folio) return block_read_full_folio(folio, minix_get_block); } -int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len) +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, minix_get_block); + return __block_write_begin(&folio->page, pos, len, minix_get_block); } static void minix_write_failed(struct address_space *mapping, loff_t to) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 063bab8faa6b..d54273c3c9ff 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -42,18 +42,18 @@ struct minix_sb_info { unsigned short s_version; }; -extern struct inode *minix_iget(struct super_block *, unsigned long); -extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode *, umode_t); -extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct super_block *sb); -extern int minix_new_block(struct inode * inode); -extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); +struct inode *minix_iget(struct super_block *, unsigned long); +struct minix_inode *minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct minix2_inode *minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); +struct inode *minix_new_inode(const struct inode *, umode_t); +void minix_free_inode(struct inode *inode); +unsigned long minix_count_free_inodes(struct super_block *sb); +int minix_new_block(struct inode *inode); +void minix_free_block(struct inode *inode, unsigned long block); +unsigned long minix_count_free_blocks(struct super_block *sb); +int minix_getattr(struct mnt_idmap *, const struct path *, + struct kstat *, u32, unsigned int); +int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); From 0551bc716e830e36c16c70282bc4134e2f3ec821 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 21:03:03 -0400 Subject: [PATCH 031/168] minixfs: Convert dir_commit_chunk() to take a folio All callers now have a folio, so pass it in. Saves a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/minix/dir.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 26bfea508028..5f9e2fc91003 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -40,18 +40,18 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } static int minix_handle_dirsync(struct inode *dir) @@ -271,7 +271,7 @@ got_it: memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } - dir_commit_chunk(&folio->page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); @@ -301,7 +301,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; - dir_commit_chunk(&folio->page, pos, len); + dir_commit_chunk(folio, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); @@ -344,7 +344,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir) } kunmap_local(kaddr); - dir_commit_chunk(&folio->page, 0, 2 * sbi->s_dirsize); + dir_commit_chunk(folio, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: folio_put(folio); @@ -422,7 +422,7 @@ int minix_set_link(struct minix_dir_entry *de, struct folio *folio, ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; - dir_commit_chunk(&folio->page, pos, sbi->s_dirsize); + dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); From 8eb835a1366f52d335aae7c2acff9c1bd57fb227 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 23:58:48 -0400 Subject: [PATCH 032/168] fs: Convert block_write_begin() to use a folio Use the folio APIs to retrieve the folio from the page cache and manipulate it. Saves a few conversions between pages & folios. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index e55ad471c530..a9aeb04da398 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2225,21 +2225,22 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; int status; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - status = __block_write_begin(page, pos, len, get_block); + status = __block_write_begin_int(folio, pos, len, get_block, NULL); if (unlikely(status)) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - *pagep = page; + *pagep = &folio->page; return status; } EXPORT_SYMBOL(block_write_begin); From 24481ffdc0d1580ec2d968682b3c2dc4fbfd6c0b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Jul 2024 11:06:01 -0400 Subject: [PATCH 033/168] reiserfs: Convert grab_tail_page() to use a folio Removes a call to grab_cache_page() and a few hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/reiserfs/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 9b43a81a6488..2d558b2012ea 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2178,7 +2178,7 @@ static int grab_tail_page(struct inode *inode, unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1); struct buffer_head *bh; struct buffer_head *head; - struct page *page; + struct folio *folio; int error; /* @@ -2190,20 +2190,20 @@ static int grab_tail_page(struct inode *inode, if ((offset & (blocksize - 1)) == 0) { return -ENOENT; } - page = grab_cache_page(inode->i_mapping, index); - error = -ENOMEM; - if (!page) { - goto out; - } + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = __block_write_begin(page, start, offset - start, + error = __block_write_begin(&folio->page, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; do { if (pos >= start) { @@ -2226,14 +2226,13 @@ static int grab_tail_page(struct inode *inode, goto unlock; } *bh_result = bh; - *page_result = page; + *page_result = &folio->page; -out: return error; unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return error; } From cc67bcb2c3709e12f1a72d5a884942309c89c2ee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 22:50:29 -0400 Subject: [PATCH 034/168] reiserfs: Convert reiserfs_write_begin() to use a folio Remove a few calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/reiserfs/inode.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2d558b2012ea..33da519c9767 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2738,20 +2738,21 @@ static int reiserfs_write_begin(struct file *file, struct page **pagep, void **fsdata) { struct inode *inode; - struct page *page; + struct folio *folio; pgoff_t index; int ret; int old_ref = 0; inode = mapping->host; index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; reiserfs_wait_on_write_block(inode->i_sb); - fix_tail_page_for_writing(page); + fix_tail_page_for_writing(&folio->page); if (reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th; th = (struct reiserfs_transaction_handle *)current-> @@ -2761,7 +2762,7 @@ static int reiserfs_write_begin(struct file *file, old_ref = th->t_refcount; th->t_refcount++; } - ret = __block_write_begin(page, pos, len, reiserfs_get_block); + ret = __block_write_begin(&folio->page, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2791,8 +2792,8 @@ static int reiserfs_write_begin(struct file *file, } } if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* Truncate allocated blocks */ reiserfs_truncate_failed_write(inode); } From 1262249d038a3823531dd220e5eced93bccf8d38 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 13:43:36 -0400 Subject: [PATCH 035/168] block: Use a folio in blkdev_write_end() Replaces two hidden calls to compound_head() with one explicit one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- block/fops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index 9825c1713a49..da44fedb23e5 100644 --- a/block/fops.c +++ b/block/fops.c @@ -460,11 +460,12 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); int ret; ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } From 696876d03542a1c348e004511c4a307770481286 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 13:46:15 -0400 Subject: [PATCH 036/168] buffer: Use a folio in generic_write_end() Replaces two implicit calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a9aeb04da398..448338810802 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2283,6 +2283,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; @@ -2293,7 +2294,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, * No need to use i_size_read() here, the i_size cannot change under us * because we hold i_rwsem. * - * But it's important to update i_size while still holding page lock: + * But it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. */ if (pos + copied > inode->i_size) { @@ -2301,8 +2302,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, i_size_changed = true; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); From 663459c851995a6dc216d83d9dea2e2aa18a2db7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 13:49:53 -0400 Subject: [PATCH 037/168] nilfs2: Use a folio in nilfs_recover_dsync_blocks() Replaces four hidden calls to compound_head() with one. Reviewed-by: Josef Bacik Acked-by: Ryusuke Konishi Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/nilfs2/recovery.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index b638dc06df2f..15653701b1c8 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -499,6 +499,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct page *page; + struct folio *folio; loff_t pos; int err = 0, err2 = 0; @@ -522,6 +523,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } + folio = page_folio(page); err = nilfs_recovery_copy_block(nilfs, rb, pos, page); if (unlikely(err)) goto failed_page; @@ -533,15 +535,15 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, page, NULL); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_page: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); failed_inode: nilfs_warn(sb, From c4c9c89c8c8ec1bcb0fad951b91da0d3a7569a71 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 14:44:14 -0400 Subject: [PATCH 038/168] ntfs3: Remove reset_log_file() This function has no callers (which will be why nobody noticed that the page wasn't being unlocked). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ntfs3/inode.c | 39 --------------------------------------- fs/ntfs3/ntfs_fs.h | 1 - 2 files changed, 40 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6b0bdc474e76..8eaaf9e465d4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -1008,45 +1008,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, return err; } -int reset_log_file(struct inode *inode) -{ - int err; - loff_t pos = 0; - u32 log_size = inode->i_size; - struct address_space *mapping = inode->i_mapping; - - for (;;) { - u32 len; - void *kaddr; - struct page *page; - - len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE; - - err = block_write_begin(mapping, pos, len, &page, - ntfs_get_block_write_begin); - if (err) - goto out; - - kaddr = kmap_atomic(page); - memset(kaddr, -1, len); - kunmap_atomic(kaddr); - flush_dcache_page(page); - - err = block_write_end(NULL, mapping, pos, len, len, page, NULL); - if (err < 0) - goto out; - pos += len; - - if (pos >= log_size) - break; - balance_dirty_pages_ratelimited(mapping); - } -out: - mark_inode_dirty_sync(inode); - - return err; -} - int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) { return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index e5255a251929..4e363b8342d6 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -708,7 +708,6 @@ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name); int ntfs_set_size(struct inode *inode, u64 new_size); -int reset_log_file(struct inode *inode); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, From 97edbc02b2efdb0cd0f507b6a45fc509acc861bb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 14:51:11 -0400 Subject: [PATCH 039/168] buffer: Convert block_write_end() to take a folio All callers now have a folio, so pass it in instead of converting from a folio to a page and back to a folio again. Saves a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- block/fops.c | 2 +- fs/buffer.c | 5 ++--- fs/ext2/dir.c | 2 +- fs/ext4/inode.c | 4 ++-- fs/iomap/buffered-io.c | 2 +- fs/minix/dir.c | 2 +- fs/nilfs2/dir.c | 2 +- fs/nilfs2/recovery.c | 2 +- fs/sysv/dir.c | 2 +- fs/ufs/dir.c | 2 +- include/linux/buffer_head.h | 4 ++-- 11 files changed, 14 insertions(+), 15 deletions(-) diff --git a/block/fops.c b/block/fops.c index da44fedb23e5..df0e762cf397 100644 --- a/block/fops.c +++ b/block/fops.c @@ -462,7 +462,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, { struct folio *folio = page_folio(page); int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); folio_unlock(folio); folio_put(folio); diff --git a/fs/buffer.c b/fs/buffer.c index 448338810802..acba3dfe55d8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2247,9 +2247,8 @@ EXPORT_SYMBOL(block_write_begin); int block_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2288,7 +2287,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, loff_t old_size = inode->i_size; bool i_size_changed = false; - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * No need to use i_size_read() here, the i_size cannot change under us diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 087457061c6e..60605fbdd0eb 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 941c1c0d5c6e..1e4831d83adc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1315,7 +1315,7 @@ static int ext4_write_end(struct file *file, return ext4_write_inline_data_end(inode, pos, len, copied, folio); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. @@ -3029,7 +3029,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, - &folio->page, NULL); + folio, NULL); new_i_size = pos + copied; /* diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f420c53d86ac..9b4ca3811a24 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -900,7 +900,7 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t bh_written; bh_written = block_write_end(NULL, iter->inode->i_mapping, pos, - len, copied, &folio->page, NULL); + len, copied, folio, NULL); WARN_ON_ONCE(bh_written != copied && bh_written != 0); return bh_written == copied; } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 5f9e2fc91003..dd2a425b41f0 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 4a29b0138d75..66af42f88ca7 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio, int err; nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to); - copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 15653701b1c8..40c5dfbc9d41 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -533,7 +533,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_page; block_write_end(NULL, inode->i_mapping, pos, blocksize, - blocksize, page, NULL); + blocksize, folio, NULL); folio_unlock(folio); folio_put(folio); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 27eaa5273ba7..639307e2ff8c 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,7 +33,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 3b3cd84f1f7f..1579561118f5 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len) struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); + block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 14acf1bbe0ce..3a3fec154536 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -262,8 +262,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, - loff_t, unsigned, unsigned, - struct page *, void *); + loff_t, unsigned len, unsigned copied, + struct folio *, void *); int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); From efe2f7a4132d8c399f9238a320df5edb0d62fd48 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 16:42:35 -0400 Subject: [PATCH 040/168] ecryptfs: Convert ecryptfs_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces four calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ecryptfs/mmap.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index e2483acc4366..9b86fad2b9d1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -234,17 +234,17 @@ out: /* * Called with lower inode mutex held. */ -static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +static int fill_zeros_to_end_of_page(struct folio *folio, unsigned int to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int end_byte_in_page; - if ((i_size_read(inode) / PAGE_SIZE) != page->index) + if ((i_size_read(inode) / PAGE_SIZE) != folio->index) goto out; end_byte_in_page = i_size_read(inode) % PAGE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_segment(page, end_byte_in_page, PAGE_SIZE); + folio_zero_segment(folio, end_byte_in_page, PAGE_SIZE); out: return 0; } @@ -465,6 +465,7 @@ static int ecryptfs_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); pgoff_t index = pos >> PAGE_SHIFT; unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + copied; @@ -476,8 +477,8 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, - to); + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + &folio->page, 0, to); if (!rc) { rc = copied; fsstack_copy_inode_size(ecryptfs_inode, @@ -485,21 +486,21 @@ static int ecryptfs_write_end(struct file *file, } goto out; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (copied < PAGE_SIZE) { rc = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Fills in zeros if 'to' goes beyond inode size */ - rc = fill_zeros_to_end_of_page(page, to); + rc = fill_zeros_to_end_of_page(folio, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " "zeros in page with index = [0x%.16lx]\n", index); goto out; } - rc = ecryptfs_encrypt_page(page); + rc = ecryptfs_encrypt_page(&folio->page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " "index [0x%.16lx])\n", index); @@ -518,8 +519,8 @@ static int ecryptfs_write_end(struct file *file, else rc = copied; out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return rc; } From 6a09084cd4e67ab2e454cf6da6737383a7c6d41c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jul 2024 11:42:46 -0400 Subject: [PATCH 041/168] ecryptfs: Use a folio in ecryptfs_write_begin() Use __filemap_get_folio() instead of grab_cache_page_write_begin() and use the folio throughout. No attempt is made here to support large folios, simply converting this function to use folio APIs is the goal. Saves many hidden calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ecryptfs/mmap.c | 53 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 9b86fad2b9d1..75ce28d757b7 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -268,35 +268,36 @@ static int ecryptfs_write_begin(struct file *file, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; + struct folio *folio; loff_t prev_page_end_size; int rc = 0; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; prev_page_end_size = ((loff_t)index << PAGE_SHIFT); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, mapping->host); + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attempting to read " "lower page segment; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } else - SetPageUptodate(page); + folio_mark_uptodate(folio); } else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { rc = ecryptfs_copy_up_encrypted_with_header( - page, crypt_stat); + &folio->page, crypt_stat); if (rc) { printk(KERN_ERR "%s: Error attempting " "to copy the encrypted content " @@ -304,46 +305,46 @@ static int ecryptfs_write_begin(struct file *file, "inserting the metadata from " "the xattr into the header; rc " "= [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { rc = ecryptfs_read_lower_page_segment( - page, index, 0, PAGE_SIZE, + &folio->page, index, 0, PAGE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", __func__, rc); - ClearPageUptodate(page); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } else { if (prev_page_end_size - >= i_size_read(page->mapping->host)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + >= i_size_read(mapping->host)) { + folio_zero_range(folio, 0, PAGE_SIZE); + folio_mark_uptodate(folio); } else if (len < PAGE_SIZE) { - rc = ecryptfs_decrypt_page(page); + rc = ecryptfs_decrypt_page(&folio->page); if (rc) { printk(KERN_ERR "%s: Error decrypting " "page at index [%ld]; " "rc = [%d]\n", - __func__, page->index, rc); - ClearPageUptodate(page); + __func__, folio->index, rc); + folio_clear_uptodate(folio); goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } } } /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ if (index != 0) { - if (prev_page_end_size > i_size_read(page->mapping->host)) { + if (prev_page_end_size > i_size_read(mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); if (rc) { @@ -359,11 +360,11 @@ static int ecryptfs_write_begin(struct file *file, * of page? Zero it out. */ if ((i_size_read(mapping->host) == prev_page_end_size) && (pos != 0)) - zero_user(page, 0, PAGE_SIZE); + folio_zero_range(folio, 0, PAGE_SIZE); out: if (unlikely(rc)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; } return rc; From a0f858d450ce7b18fd2b7db6f22b321c2f28ed89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 16:42:35 -0400 Subject: [PATCH 042/168] f2fs: Convert f2fs_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces five calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/f2fs/data.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6457e5bca9c9..58ac23e124a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3687,7 +3687,8 @@ static int f2fs_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); @@ -3696,17 +3697,17 @@ static int f2fs_write_end(struct file *file, * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (unlikely(copied != len)) copied = 0; else - SetPageUptodate(page); + folio_mark_uptodate(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { - f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_compress_write_end(inode, fsdata, folio->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && @@ -3719,7 +3720,7 @@ static int f2fs_write_end(struct file *file, if (!copied) goto unlock_out; - set_page_dirty(page); + folio_mark_dirty(folio); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { @@ -3729,7 +3730,8 @@ static int f2fs_write_end(struct file *file, pos + copied); } unlock_out: - f2fs_put_page(page, 1); + folio_unlock(folio); + folio_put(folio); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } From dfd2e81d37e154f70f1e35d54ddb3d9df34595f2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jul 2024 16:58:06 -0400 Subject: [PATCH 043/168] f2fs: Convert f2fs_write_begin() to use a folio Fetch a folio from the page cache instead of a page and use it throughout. We still have to convert back to a page for calling internal f2fs functions, but hopefully they will be converted soon. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/f2fs/data.c | 66 ++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 58ac23e124a5..9a45f9fb8a64 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3556,8 +3556,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page = NULL; - pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + struct folio *folio; + pgoff_t index = pos >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; @@ -3573,7 +3573,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: - * lock_page(page #0) -> lock_page(inode_page) + * folio_lock(folio #0) -> folio_lock(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); @@ -3603,81 +3603,85 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, repeat: /* - * Do not use grab_cache_page_write_begin() to avoid deadlock due to - * wait_for_stable_page. Will wait that below with our IO control. + * Do not use FGP_STABLE to avoid deadlock. + * Will wait that below with our IO control. */ - page = f2fs_pagecache_get_page(mapping, index, + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); - if (!page) { - err = -ENOMEM; + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ - *pagep = page; + *pagep = &folio->page; if (f2fs_is_atomic_file(inode)) - err = prepare_atomic_write_begin(sbi, page, pos, len, + err = prepare_atomic_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance, &use_cow); else - err = prepare_write_begin(sbi, page, pos, len, + err = prepare_write_begin(sbi, &folio->page, pos, len, &blkaddr, &need_balance); if (err) - goto fail; + goto put_folio; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { - unlock_page(page); + folio_unlock(folio); f2fs_balance_fs(sbi, true); - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - f2fs_put_page(page, 1); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); goto repeat; } } - f2fs_wait_on_page_writeback(page, DATA, false, true); + f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); - if (len == PAGE_SIZE || PageUptodate(page)) + if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { - zero_user_segment(page, len, PAGE_SIZE); + folio_zero_segment(folio, len, PAGE_SIZE); return 0; } if (blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; - goto fail; + goto put_folio; } err = f2fs_submit_page_read(use_cow ? - F2FS_I(inode)->cow_inode : inode, page, + F2FS_I(inode)->cow_inode : inode, &folio->page, blkaddr, 0, true); if (err) - goto fail; + goto put_folio; - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto fail; + goto put_folio; } } return 0; +put_folio: + folio_unlock(folio); + folio_put(folio); fail: - f2fs_put_page(page, 1); f2fs_write_failed(inode, pos + len); return err; } From 556d0ac068d71b78c309d7444357df4fa55f594e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 16:42:35 -0400 Subject: [PATCH 044/168] fuse: Convert fuse_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces five calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/fuse/file.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7..f4102c6657af 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2434,29 +2434,30 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } From a060d835cf76605fbb784f1285a6d40e9239c436 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jul 2024 16:58:06 -0400 Subject: [PATCH 045/168] fuse: Convert fuse_write_begin() to use a folio Fetch a folio from the page cache instead of a page and use it throughout removing several calls to compound_head() and supporting large folios (in this function). We still have to convert back to a page for calling internal fuse functions, but hopefully they will be converted soon. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/fuse/file.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f4102c6657af..137485999d3d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2391,41 +2391,42 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readpage(file, &folio->page); if (err) goto cleanup; success: - *pagep = page; + *pagep = &folio->page; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } From 02d8a3227f49f07cd8c2c4f42b4449f657c060c5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Jul 2024 12:08:54 -0400 Subject: [PATCH 046/168] hostfs: Convert hostfs_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces four calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/hostfs/hostfs_kern.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 22df574ca99e..805cfe39c40f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -479,17 +479,18 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; void *buffer; - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); int err; - buffer = kmap_local_page(page); - err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied); + buffer = kmap_local_folio(folio, from); + err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied); kunmap_local(buffer); - if (!PageUptodate(page) && err == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && err == folio_size(folio)) + folio_mark_uptodate(folio); /* * If err > 0, write_file has added err to pos, so we are comparing @@ -497,8 +498,8 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping, */ if (err > 0 && (pos > inode->i_size)) inode->i_size = pos; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } From c8dbe54a2e0bf8cb9e7d36cdf16507d0025ef028 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 16:42:35 -0400 Subject: [PATCH 047/168] jffs2: Convert jffs2_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces six calls to compound_head() with one. Also use kmap_local instead of kmap. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/jffs2/file.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index e12cb145147e..1a7dbf846c7c 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -240,6 +240,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *pg, void *fsdata) { + struct folio *folio = page_folio(pg); /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple */ @@ -252,16 +253,17 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, unsigned aligned_start = start & ~3; int ret = 0; uint32_t writtenlen = 0; + void *buf; - jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT, - start, end, pg->flags); + jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", + __func__, inode->i_ino, folio_pos(folio), + start, end, folio->flags); /* We need to avoid deadlock with page_cache_read() in - jffs2_garbage_collect_pass(). So the page must be + jffs2_garbage_collect_pass(). So the folio must be up to date to prevent page_cache_read() from trying to re-lock it. */ - BUG_ON(!PageUptodate(pg)); + BUG_ON(!folio_test_uptodate(folio)); if (end == PAGE_SIZE) { /* When writing out the end of a page, write out the @@ -276,8 +278,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, if (!ri) { jffs2_dbg(1, "%s(): Allocation of raw inode failed\n", __func__); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return -ENOMEM; } @@ -289,15 +291,11 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, ri->isize = cpu_to_je32((uint32_t)inode->i_size); ri->atime = ri->ctime = ri->mtime = cpu_to_je32(JFFS2_NOW()); - /* In 2.4, it was already kmapped by generic_file_write(). Doesn't - hurt to do it again. The alternative is ifdefs, which are ugly. */ - kmap(pg); - - ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start, - (pg->index << PAGE_SHIFT) + aligned_start, + buf = kmap_local_folio(folio, aligned_start); + ret = jffs2_write_inode_range(c, f, ri, buf, + folio_pos(folio) + aligned_start, end - aligned_start, &writtenlen); - - kunmap(pg); + kunmap_local(buf); if (ret) mapping_set_error(mapping, ret); @@ -323,12 +321,12 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, it gets reread */ jffs2_dbg(1, "%s(): Not all bytes written. Marking page !uptodate\n", __func__); - ClearPageUptodate(pg); + folio_clear_uptodate(folio); } jffs2_dbg(1, "%s() returning %d\n", __func__, writtenlen > 0 ? writtenlen : ret); - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); return writtenlen > 0 ? writtenlen : ret; } From 0ee818cc42fc313b480e2e29ebe7b0dc14ccc156 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jul 2024 16:58:06 -0400 Subject: [PATCH 048/168] jffs2: Convert jffs2_write_begin() to use a folio Fetch a folio from the page cache instead of a page and use it throughout removing several calls to compound_head(). We still have to convert back to a page for calling internal jffs2 functions, but hopefully they will be converted soon. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/jffs2/file.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 1a7dbf846c7c..ee8f7f029b45 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -127,7 +127,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { - struct page *pg; + struct folio *folio; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); @@ -206,29 +206,30 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * page in read_cache_page(), which causes a deadlock. */ mutex_lock(&c->alloc_sem); - pg = grab_cache_page_write_begin(mapping, index); - if (!pg) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto release_sem; } - *pagep = pg; + *pagep = &folio->page; /* - * Read in the page if it wasn't already present. Cannot optimize away - * the whole page write case until jffs2_write_end can handle the + * Read in the folio if it wasn't already present. Cannot optimize away + * the whole folio write case until jffs2_write_end can handle the * case of a short-copy. */ - if (!PageUptodate(pg)) { + if (!folio_test_uptodate(folio)) { mutex_lock(&f->sem); - ret = jffs2_do_readpage_nolock(inode, pg); + ret = jffs2_do_readpage_nolock(inode, &folio->page); mutex_unlock(&f->sem); if (ret) { - unlock_page(pg); - put_page(pg); + folio_unlock(folio); + folio_put(folio); goto release_sem; } } - jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); release_sem: mutex_unlock(&c->alloc_sem); From 87969292a93f4afbd1179fb46ad02ac1dd275ca0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 17:06:19 -0400 Subject: [PATCH 049/168] orangefs: Convert orangefs_write_end() to use a folio Convert the passed page to a folio and operate on that. Replaces five calls to compound_head() with one. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/orangefs/inode.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fdb9b65db1de..6595417f62b1 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -367,7 +367,8 @@ okay: static int orangefs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* @@ -377,23 +378,23 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { unsigned from = pos & (PAGE_SIZE - 1); if (copied < len) { - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } /* Set fully written pages uptodate. */ - if (pos == page_offset(page) && + if (pos == folio_pos(folio) && (len == PAGE_SIZE || pos + len == inode->i_size)) { - zero_user_segment(page, from + copied, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, from + copied, PAGE_SIZE); + folio_mark_uptodate(folio); } } - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); mark_inode_dirty_sync(file_inode(file)); return copied; From 4c7e13850f317933b081fd49841dd246bff99619 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 11 Jul 2024 16:58:06 -0400 Subject: [PATCH 050/168] orangefs: Convert orangefs_write_begin() to use a folio Retrieve a folio from the page cache instead of a page. This function was previously mostly converted to use a folio, so it's a fairly small change. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/orangefs/inode.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 6595417f62b1..e8440fa7d73c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -313,18 +313,14 @@ static int orangefs_write_begin(struct file *file, { struct orangefs_write_range *wr; struct folio *folio; - struct page *page; - pgoff_t index; int ret; - index = pos >> PAGE_SHIFT; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - - *pagep = page; - folio = page_folio(page); + *pagep = &folio->page; if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* From 3e5d37c5f98a06ee68a5c3e2784d4a4420e9d227 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 2 Jun 2024 00:02:10 -0400 Subject: [PATCH 051/168] vboxsf: Use a folio in vboxsf_write_end() Because we have to kmap() the page before calling vboxsf_write(), we can't entirely remove the use of struct page. But we can eliminate some uses of old APIs and remove some unnecessary calls to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/vboxsf/file.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index fdb4da24d662..029f106d56d9 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -302,16 +302,17 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct vboxsf_handle *sf_handle = file->private_data; - unsigned int from = pos & ~PAGE_MASK; + size_t from = offset_in_folio(folio, pos); u32 nwritten = len; u8 *buf; int err; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page) && copied < len) - zero_user(page, from + copied, len - copied); + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio) && copied < len) + folio_zero_range(folio, from + copied, len - copied); buf = kmap(page); err = vboxsf_write(sf_handle->root, sf_handle->handle, @@ -326,16 +327,16 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping, /* mtime changed */ VBOXSF_I(inode)->force_restat = 1; - if (!PageUptodate(page) && nwritten == PAGE_SIZE) - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && nwritten == folio_size(folio)) + folio_mark_uptodate(folio); pos += nwritten; if (pos > inode->i_size) i_size_write(inode, pos); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return nwritten; } @@ -343,7 +344,7 @@ out: /* * Note simple_write_begin does not read the page from disk on partial writes * this is ok since vboxsf_write_end only writes the written parts of the - * page and it does not call SetPageUptodate for partial writes. + * page and it does not call folio_mark_uptodate for partial writes. */ const struct address_space_operations vboxsf_reg_aops = { .read_folio = vboxsf_read_folio, From a225800f322a3d6cc8b8b6c7dc4d5281f2f5375b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 15:45:32 -0400 Subject: [PATCH 052/168] fs: Convert aops->write_end to take a folio Most callers have a folio, and most implementations operate on a folio, so remove the conversion from folio->page->folio to fit through this interface. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 6 +++--- block/fops.c | 3 +-- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 4 ++-- fs/affs/file.c | 9 ++++----- fs/bcachefs/fs-io-buffered.c | 3 +-- fs/bcachefs/fs-io-buffered.h | 2 +- fs/buffer.c | 9 ++++----- fs/ceph/addr.c | 3 +-- fs/ecryptfs/mmap.c | 5 ++--- fs/exfat/file.c | 2 +- fs/exfat/inode.c | 4 ++-- fs/ext2/inode.c | 4 ++-- fs/ext4/inode.c | 11 ++++------- fs/ext4/verity.c | 2 +- fs/f2fs/data.c | 3 +-- fs/f2fs/super.c | 2 +- fs/f2fs/verity.c | 2 +- fs/fat/inode.c | 4 ++-- fs/fuse/file.c | 3 +-- fs/hfs/extent.c | 2 +- fs/hfsplus/extents.c | 2 +- fs/hostfs/hostfs_kern.c | 3 +-- fs/hpfs/file.c | 4 ++-- fs/hugetlbfs/inode.c | 2 +- fs/jffs2/file.c | 5 ++--- fs/jfs/inode.c | 4 ++-- fs/libfs.c | 9 ++++----- fs/namei.c | 2 +- fs/nfs/file.c | 3 +-- fs/nilfs2/inode.c | 6 +++--- fs/ntfs3/file.c | 2 +- fs/ntfs3/inode.c | 5 ++--- fs/ntfs3/ntfs_fs.h | 2 +- fs/ocfs2/aops.c | 2 +- fs/orangefs/inode.c | 4 ++-- fs/reiserfs/inode.c | 11 +++++------ fs/ubifs/file.c | 3 +-- fs/udf/inode.c | 6 ++---- fs/ufs/inode.c | 4 ++-- fs/vboxsf/file.c | 7 +++---- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 2 +- mm/filemap.c | 2 +- mm/shmem.c | 3 +-- 45 files changed, 80 insertions(+), 102 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index e664061ed55d..827fb5a073b7 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -254,7 +254,7 @@ prototypes:: struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); bool (*release_folio)(struct folio *, gfp_t); diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 6e903a903f8f..0e24f770c568 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -810,7 +810,7 @@ cache in your filesystem. The following members are defined: struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + struct folio *folio, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t start, size_t len); bool (*release_folio)(struct folio *, gfp_t); @@ -944,8 +944,8 @@ cache in your filesystem. The following members are defined: called. len is the original len passed to write_begin, and copied is the amount that was able to be copied. - The filesystem must take care of unlocking the page and - releasing it refcount, and updating i_size. + The filesystem must take care of unlocking the folio, + decrementing its refcount, and updating i_size. Returns < 0 on failure, otherwise the number of bytes (<= 'copied') that were able to be copied into pagecache. diff --git a/block/fops.c b/block/fops.c index df0e762cf397..cd96a6f17119 100644 --- a/block/fops.c +++ b/block/fops.c @@ -457,10 +457,9 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping, } static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, + loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); int ret; ret = block_write_end(file, mapping, pos, len, copied, folio, fsdata); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index c5e1c718a6d2..3513165a2964 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -494,7 +494,7 @@ shmem_pwrite(struct drm_i915_gem_object *obj, kunmap_local(vaddr); err = aops->write_end(obj->base.filp, mapping, offset, len, - len - unwritten, page, data); + len - unwritten, page_folio(page), data); if (err < 0) return err; @@ -688,7 +688,7 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, kunmap(page); err = aops->write_end(file, file->f_mapping, offset, len, len, - page, pgdata); + page_folio(page), pgdata); if (err < 0) goto fail; diff --git a/fs/affs/file.c b/fs/affs/file.c index 04c018e19602..6a6c5bc41b8f 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -433,12 +433,12 @@ static int affs_write_begin(struct file *file, struct address_space *mapping, static int affs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); /* Clear Archived bit on file writes, as AmigaOS would do */ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) { @@ -687,9 +687,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -889,7 +888,7 @@ affs_truncate(struct inode *inode) res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); if (!res) - res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page_folio(page), fsdata); else inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index cc33d763f722..46ae9ef3589a 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -743,12 +743,11 @@ err_unlock: int bch2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation *res = fsdata; - struct folio *folio = page_folio(page); unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index a6126ff790e6..16569b9874e0 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -13,7 +13,7 @@ void bch2_readahead(struct readahead_control *); int bch2_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, - unsigned, unsigned, struct page *, void *); + unsigned len, unsigned copied, struct folio *, void *); ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); diff --git a/fs/buffer.c b/fs/buffer.c index acba3dfe55d8..1a0f2f65e890 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2280,9 +2280,8 @@ EXPORT_SYMBOL(block_write_end); int generic_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool i_size_changed = false; @@ -2480,7 +2479,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + err = aops->write_end(NULL, mapping, size, 0, 0, page_folio(page), fsdata); BUG_ON(err > 0); out: @@ -2518,7 +2517,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, goto out; zero_user(page, zerofrom, len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + page_folio(page), fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2551,7 +2550,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, goto out; zero_user(page, zerofrom, len); err = aops->write_end(file, mapping, curpos, len, len, - page, fsdata); + page_folio(page), fsdata); if (err < 0) goto out; BUG_ON(err != len); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..87369e2659c7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1509,9 +1509,8 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *subpage, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 75ce28d757b7..f43e42ede75e 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -458,15 +458,14 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) * @pos: The file position * @len: The length of the data (unused) * @copied: The amount of data copied - * @page: The eCryptfs page + * @folio: The eCryptfs folio * @fsdata: The fsdata (unused) */ static int ecryptfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); pgoff_t index = pos >> PAGE_SHIFT; unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + copied; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 64c31867bc76..7144472d092e 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -548,7 +548,7 @@ static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) zero_user_segment(page, zerofrom, zerofrom + len); - err = ops->write_end(file, mapping, start, len, len, page, NULL); + err = ops->write_end(file, mapping, start, len, len, page_folio(page), NULL); if (err < 0) goto out; start += len; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index dd894e558c91..871e9e3e407e 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -463,13 +463,13 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping, static int exfat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; struct exfat_inode_info *ei = EXFAT_I(inode); int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ei->i_size_aligned < i_size_read(inode)) { exfat_fs_error(inode->i_sb, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0caa1650cee8..aba41f6150e5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -928,11 +928,11 @@ ext2_write_begin(struct file *file, struct address_space *mapping, static int ext2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e4831d83adc..e19ac0a82bdc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1298,9 +1298,8 @@ static int write_end_fn(handle_t *handle, struct inode *inode, static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1402,9 +1401,8 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -3080,15 +3078,14 @@ static int ext4_da_do_write_end(struct address_space *mapping, static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; - struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, &folio->page, fsdata); + len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 2f37e1ea3955..86ef272296ab 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -86,7 +86,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, memcpy_to_page(page, offset_in_page(pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, page_folio(page), fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9a45f9fb8a64..4c32e2cfc403 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3689,9 +3689,8 @@ fail: static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3959fd137cc9..cfd38cd4f82c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2700,7 +2700,7 @@ retry: memcpy_to_page(page, offset, data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, fsdata); + page_folio(page), fsdata); offset = 0; towrite -= tocopy; off += tocopy; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f7bb0c54502c..486512144bf1 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -90,7 +90,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, memcpy_to_page(page, offset_in_page(pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page, fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, page_folio(page), fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 19115fd2d2a4..8ed02b17d591 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -236,11 +236,11 @@ static int fat_write_begin(struct file *file, struct address_space *mapping, static int fat_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) fat_write_failed(mapping, pos + len); if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 137485999d3d..a5cd6e4f9b2b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2433,9 +2433,8 @@ error: static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 6d1878b99b30..30512dbb79f0 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -495,7 +495,7 @@ void hfs_file_truncate(struct inode *inode) &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, - page, fsdata); + page_folio(page), fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 9c51867dddc5..776c0ea722cb 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -563,7 +563,7 @@ void hfsplus_file_truncate(struct inode *inode) if (res) return; res = generic_write_end(NULL, mapping, size, 0, 0, - page, fsdata); + page_folio(page), fsdata); if (res < 0) return; mark_inode_dirty(inode); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 805cfe39c40f..d0cbc5ece952 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -477,9 +477,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping, static int hostfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; void *buffer; size_t from = offset_in_folio(folio, pos); diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 1bb8d97cd9ae..11e2d9e612ac 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -206,11 +206,11 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping, static int hpfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pagep, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int err; - err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (err < len) hpfs_write_failed(mapping, pos + len); if (!(err < 0)) { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9f6cff356796..b7a30055e6f4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -395,7 +395,7 @@ static int hugetlbfs_write_begin(struct file *file, static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { BUG(); return -EINVAL; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ee8f7f029b45..b6fe1f1e8ea9 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -23,7 +23,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata); + struct folio *folio, void *fsdata); static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata); @@ -239,9 +239,8 @@ out_err: static int jffs2_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *pg, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(pg); /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple */ diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 1a6b5921d17a..d63494182548 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -304,12 +304,12 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, } static int jfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, + loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) jfs_write_failed(mapping, pos + len); return ret; diff --git a/fs/libfs.c b/fs/libfs.c index 8aa34870449f..4581be1d5b32 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -929,11 +929,11 @@ EXPORT_SYMBOL(simple_write_begin); * @pos: " * @len: " * @copied: " - * @page: " + * @folio: " * @fsdata: " * - * simple_write_end does the minimum needed for updating a page after writing is - * done. It has the same API signature as the .write_end of + * simple_write_end does the minimum needed for updating a folio after + * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_mutex is assumed to be held. * Block based filesystems should use generic_write_end(). @@ -946,9 +946,8 @@ EXPORT_SYMBOL(simple_write_begin); */ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; diff --git a/fs/namei.c b/fs/namei.c index 5512cb10fa89..fb7055245e68 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5321,7 +5321,7 @@ retry: memcpy(page_address(page), symname, len-1); err = aops->write_end(NULL, mapping, 0, len-1, len-1, - page, fsdata); + page_folio(page), fsdata); if (err < 0) goto fail; if (err < len-1) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 61a8cdb9f1e1..fe583dbebe54 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -372,10 +372,9 @@ start: static int nfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct folio *folio = page_folio(page); unsigned offset = offset_in_folio(folio, pos); int status; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 7340a01d80e1..25841b75415a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -269,16 +269,16 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, static int nilfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; unsigned int start = pos & (PAGE_SIZE - 1); unsigned int nr_dirty; int err; - nr_dirty = nilfs_page_count_clean_buffers(page, start, + nr_dirty = nilfs_page_count_clean_buffers(&folio->page, start, start + copied); - copied = generic_write_end(file, mapping, pos, len, copied, page, + copied = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); nilfs_set_file_dirty(inode, nr_dirty); err = nilfs_transaction_commit(inode->i_sb); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index ca1ddc46bd86..88a7d81cf2ba 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -215,7 +215,7 @@ static int ntfs_extend_initialized_size(struct file *file, zero_user_segment(page, zerofrom, PAGE_SIZE); /* This function in any case puts page. */ - err = ntfs_write_end(file, mapping, pos, len, len, page, NULL); + err = ntfs_write_end(file, mapping, pos, len, len, page_folio(page), NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 8eaaf9e465d4..9efd3f7c59d6 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -947,9 +947,8 @@ out: * ntfs_write_end - Address_space_operations::write_end. */ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata) + u32 len, u32 copied, struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; @@ -979,7 +978,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, folio_unlock(folio); folio_put(folio); } else { - err = generic_write_end(file, mapping, pos, len, copied, page, + err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); } diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 4e363b8342d6..53517281f26b 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -713,7 +713,7 @@ int ntfs_get_block(struct inode *inode, sector_t vbn, int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct page **pagep, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, - u32 len, u32 copied, struct page *page, void *fsdata); + u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 6be175a1ab3c..9d8aa417e8da 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2076,7 +2076,7 @@ out: static int ocfs2_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; struct inode *inode = mapping->host; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index e8440fa7d73c..69b507a611f8 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -361,9 +361,9 @@ okay: } static int orangefs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, struct folio *folio, + void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 33da519c9767..f4b9db4b4df2 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2862,10 +2862,9 @@ static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int ret = 0; int update_sd = 0; struct reiserfs_transaction_handle *th; @@ -2887,7 +2886,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, } flush_dcache_folio(folio); - reiserfs_commit_page(inode, page, start, start + copied); + reiserfs_commit_page(inode, &folio->page, start, start + copied); /* * generic_commit_write does this for us, but does not update the @@ -2942,8 +2941,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, out: if (locked) reiserfs_write_unlock(inode->i_sb); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (pos + len > inode->i_size) reiserfs_truncate_failed_write(inode); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 68e104423a48..baa3eecc32fe 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -524,9 +524,8 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio, static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4726a4d014b6..fdf024e0b772 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -273,16 +273,14 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, static int udf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); - struct folio *folio; loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) - return generic_write_end(file, mapping, pos, len, copied, page, + return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); - folio = page_folio(page); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 0e608fc0d0fd..69adb198c634 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -511,11 +511,11 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, static int ufs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { int ret; - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata); if (ret < len) ufs_write_failed(mapping, pos + len); return ret; diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 029f106d56d9..b780deb81b02 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -300,9 +300,8 @@ static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) static int vboxsf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct vboxsf_handle *sf_handle = file->private_data; size_t from = offset_in_folio(folio, pos); @@ -314,10 +313,10 @@ static int vboxsf_write_end(struct file *file, struct address_space *mapping, if (!folio_test_uptodate(folio) && copied < len) folio_zero_range(folio, from + copied, len - copied); - buf = kmap(page); + buf = kmap(&folio->page); err = vboxsf_write(sf_handle->root, sf_handle->handle, pos, &nwritten, buf + from); - kunmap(page); + kunmap(&folio->page); if (err) { nwritten = 0; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3a3fec154536..165e859664a5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -265,8 +265,8 @@ int block_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); int generic_write_end(struct file *, struct address_space *, - loff_t, unsigned, unsigned, - struct page *, void *); + loff_t, unsigned len, unsigned copied, + struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..1af3373c7cd2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -411,7 +411,7 @@ struct address_space_operations { struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); diff --git a/mm/filemap.c b/mm/filemap.c index d62150418b91..fab6b0c3044e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4033,7 +4033,7 @@ retry: flush_dcache_folio(folio); status = a_ops->write_end(file, mapping, pos, bytes, copied, - page, fsdata); + folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) diff --git a/mm/shmem.c b/mm/shmem.c index 2faa9daaf54b..1116f147d788 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2917,9 +2917,8 @@ shmem_write_begin(struct file *file, struct address_space *mapping, static int shmem_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct folio *folio = page_folio(page); struct inode *inode = mapping->host; if (pos + copied > inode->i_size) From 1da86618bdce301d23e89ecce92161f9d3b3c5e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 15 Jul 2024 14:24:01 -0400 Subject: [PATCH 053/168] fs: Convert aops->write_begin to take a folio Convert all callers from working on a page to working on one page of a folio (support for working on an entire folio can come later). Removes a lot of folio->page->folio conversions. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 4 +- Documentation/filesystems/vfs.rst | 6 +-- block/fops.c | 4 +- drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 47 +++++++++++------------ fs/adfs/inode.c | 5 +-- fs/affs/file.c | 15 ++++---- fs/bcachefs/fs-io-buffered.c | 5 +-- fs/bcachefs/fs-io-buffered.h | 4 +- fs/bfs/file.c | 4 +- fs/buffer.c | 28 +++++++------- fs/ceph/addr.c | 10 ++--- fs/ecryptfs/mmap.c | 7 ++-- fs/exfat/file.c | 8 ++-- fs/exfat/inode.c | 5 +-- fs/ext2/inode.c | 4 +- fs/ext4/ext4.h | 4 +- fs/ext4/inline.c | 8 ++-- fs/ext4/inode.c | 14 +++---- fs/ext4/verity.c | 8 ++-- fs/f2fs/data.c | 8 ++-- fs/f2fs/super.c | 8 ++-- fs/f2fs/verity.c | 8 ++-- fs/fat/inode.c | 5 +-- fs/fuse/file.c | 4 +- fs/hfs/extent.c | 6 +-- fs/hfs/hfs_fs.h | 2 +- fs/hfs/inode.c | 5 +-- fs/hfsplus/extents.c | 6 +-- fs/hfsplus/hfsplus_fs.h | 2 +- fs/hfsplus/inode.c | 5 +-- fs/hostfs/hostfs_kern.c | 7 ++-- fs/hpfs/file.c | 5 +-- fs/hugetlbfs/inode.c | 2 +- fs/jffs2/file.c | 6 +-- fs/jfs/inode.c | 4 +- fs/libfs.c | 4 +- fs/minix/inode.c | 4 +- fs/namei.c | 10 ++--- fs/nfs/file.c | 4 +- fs/nilfs2/inode.c | 4 +- fs/nilfs2/recovery.c | 6 +-- fs/ntfs3/file.c | 9 ++--- fs/ntfs3/inode.c | 7 ++-- fs/ntfs3/ntfs_fs.h | 2 +- fs/ocfs2/aops.c | 10 ++--- fs/ocfs2/aops.h | 2 +- fs/ocfs2/mmap.c | 6 +-- fs/omfs/file.c | 4 +- fs/orangefs/inode.c | 4 +- fs/reiserfs/inode.c | 4 +- fs/sysv/itree.c | 4 +- fs/ubifs/file.c | 10 ++--- fs/udf/inode.c | 6 +-- fs/ufs/inode.c | 4 +- include/linux/buffer_head.h | 4 +- include/linux/fs.h | 4 +- mm/filemap.c | 4 +- mm/shmem.c | 8 ++-- 58 files changed, 190 insertions(+), 207 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 827fb5a073b7..f5e3676db954 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -251,7 +251,7 @@ prototypes:: void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); @@ -280,7 +280,7 @@ read_folio: yes, unlocks shared writepages: dirty_folio: maybe readahead: yes, unlocks shared -write_begin: locks the page exclusive +write_begin: locks the folio exclusive write_end: yes, unlocks exclusive bmap: invalidate_folio: yes exclusive diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 0e24f770c568..4f67b5ea0568 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -926,12 +926,12 @@ cache in your filesystem. The following members are defined: (if they haven't been read already) so that the updated blocks can be written out properly. - The filesystem must return the locked pagecache page for the - specified offset, in ``*pagep``, for the caller to write into. + The filesystem must return the locked pagecache folio for the + specified offset, in ``*foliop``, for the caller to write into. It must be able to cope with short writes (where the length passed to write_begin is greater than the number of bytes copied - into the page). + into the folio). A void * may be returned in fsdata, which then gets passed into write_end. diff --git a/block/fops.c b/block/fops.c index cd96a6f17119..dacbd61fda29 100644 --- a/block/fops.c +++ b/block/fops.c @@ -451,9 +451,9 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - return block_write_begin(mapping, pos, len, pagep, blkdev_get_block); + return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 3513165a2964..fe69f2c8527d 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -424,7 +424,8 @@ shmem_pwrite(struct drm_i915_gem_object *obj, struct address_space *mapping = obj->base.filp->f_mapping; const struct address_space_operations *aops = mapping->a_ops; char __user *user_data = u64_to_user_ptr(arg->data_ptr); - u64 remain, offset; + u64 remain; + loff_t pos; unsigned int pg; /* Caller already validated user args */ @@ -457,12 +458,12 @@ shmem_pwrite(struct drm_i915_gem_object *obj, */ remain = arg->size; - offset = arg->offset; - pg = offset_in_page(offset); + pos = arg->offset; + pg = offset_in_page(pos); do { unsigned int len, unwritten; - struct page *page; + struct folio *folio; void *data, *vaddr; int err; char __maybe_unused c; @@ -480,21 +481,19 @@ shmem_pwrite(struct drm_i915_gem_object *obj, if (err) return err; - err = aops->write_begin(obj->base.filp, mapping, offset, len, - &page, &data); + err = aops->write_begin(obj->base.filp, mapping, pos, len, + &folio, &data); if (err < 0) return err; - vaddr = kmap_local_page(page); + vaddr = kmap_local_folio(folio, offset_in_folio(folio, pos)); pagefault_disable(); - unwritten = __copy_from_user_inatomic(vaddr + pg, - user_data, - len); + unwritten = __copy_from_user_inatomic(vaddr, user_data, len); pagefault_enable(); kunmap_local(vaddr); - err = aops->write_end(obj->base.filp, mapping, offset, len, - len - unwritten, page_folio(page), data); + err = aops->write_end(obj->base.filp, mapping, pos, len, + len - unwritten, folio, data); if (err < 0) return err; @@ -504,7 +503,7 @@ shmem_pwrite(struct drm_i915_gem_object *obj, remain -= len; user_data += len; - offset += len; + pos += len; pg = 0; } while (remain); @@ -660,7 +659,7 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, struct drm_i915_gem_object *obj; struct file *file; const struct address_space_operations *aops; - resource_size_t offset; + loff_t pos; int err; GEM_WARN_ON(IS_DGFX(i915)); @@ -672,29 +671,27 @@ i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, file = obj->base.filp; aops = file->f_mapping->a_ops; - offset = 0; + pos = 0; do { unsigned int len = min_t(typeof(size), size, PAGE_SIZE); - struct page *page; - void *pgdata, *vaddr; + struct folio *folio; + void *fsdata; - err = aops->write_begin(file, file->f_mapping, offset, len, - &page, &pgdata); + err = aops->write_begin(file, file->f_mapping, pos, len, + &folio, &fsdata); if (err < 0) goto fail; - vaddr = kmap(page); - memcpy(vaddr, data, len); - kunmap(page); + memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len); - err = aops->write_end(file, file->f_mapping, offset, len, len, - page_folio(page), pgdata); + err = aops->write_end(file, file->f_mapping, pos, len, len, + folio, fsdata); if (err < 0) goto fail; size -= len; data += len; - offset += len; + pos += len; } while (size); return obj; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index a183e213a4a5..21527189e430 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -55,12 +55,11 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) static int adfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/affs/file.c b/fs/affs/file.c index 6a6c5bc41b8f..a5a861dd5223 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -417,12 +417,11 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -648,7 +647,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct folio *folio; @@ -671,7 +670,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (folio_test_uptodate(folio)) return 0; @@ -881,14 +880,14 @@ affs_truncate(struct inode *inode) if (inode->i_size > AFFS_I(inode)->mmu_private) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; void *fsdata = NULL; loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &folio, &fsdata); if (!res) - res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page_folio(page), fsdata); + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, folio, fsdata); else inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 46ae9ef3589a..3d410d801825 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -659,7 +659,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc int bch2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -728,12 +728,11 @@ out: goto err; } - *pagep = &folio->page; + *foliop = folio; return 0; err: folio_unlock(folio); folio_put(folio); - *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); kfree(res); diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h index 16569b9874e0..3207ebbb4ab4 100644 --- a/fs/bcachefs/fs-io-buffered.h +++ b/fs/bcachefs/fs-io-buffered.h @@ -10,8 +10,8 @@ int bch2_read_folio(struct file *, struct folio *); int bch2_writepages(struct address_space *, struct writeback_control *); void bch2_readahead(struct readahead_control *); -int bch2_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **); +int bch2_write_begin(struct file *, struct address_space *, loff_t pos, + unsigned len, struct folio **, void **); int bch2_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index a778411574a9..fa66a09e496a 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -172,11 +172,11 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); diff --git a/fs/buffer.c b/fs/buffer.c index 1a0f2f65e890..d52a740f7fca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2222,7 +2222,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) * The filesystem needs to handle block truncation upon failure. */ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, get_block_t *get_block) + struct folio **foliop, get_block_t *get_block) { pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; @@ -2240,7 +2240,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, folio = NULL; } - *pagep = &folio->page; + *foliop = folio; return status; } EXPORT_SYMBOL(block_write_begin); @@ -2467,7 +2467,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; @@ -2475,11 +2475,11 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) if (err) goto out; - err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata); + err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata); if (err) goto out; - err = aops->write_end(NULL, mapping, size, 0, 0, page_folio(page), fsdata); + err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata); BUG_ON(err > 0); out: @@ -2493,7 +2493,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); - struct page *page; + struct folio *folio; void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; @@ -2512,12 +2512,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = PAGE_SIZE - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page_folio(page), fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2545,12 +2545,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, len = offset - zerofrom; err = aops->write_begin(file, mapping, curpos, len, - &page, &fsdata); + &folio, &fsdata); if (err) goto out; - zero_user(page, zerofrom, len); + folio_zero_range(folio, offset_in_folio(folio, curpos), len); err = aops->write_end(file, mapping, curpos, len, len, - page_folio(page), fsdata); + folio, fsdata); if (err < 0) goto out; BUG_ON(err != len); @@ -2566,7 +2566,7 @@ out: */ int cont_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, get_block_t *get_block, loff_t *bytes) { struct inode *inode = mapping->host; @@ -2584,7 +2584,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping, (*bytes)++; } - return block_write_begin(mapping, pos, len, pagep, get_block); + return block_write_begin(mapping, pos, len, foliop, get_block); } EXPORT_SYMBOL(cont_write_begin); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 87369e2659c7..4402cddf82b5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1486,20 +1486,18 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned */ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct folio *folio = NULL; int r; - r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); + r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, foliop, NULL); if (r < 0) return r; - folio_wait_private_2(folio); /* [DEPRECATED] */ - WARN_ON_ONCE(!folio_test_locked(folio)); - *pagep = &folio->page; + folio_wait_private_2(*foliop); /* [DEPRECATED] */ + WARN_ON_ONCE(!folio_test_locked(*foliop)); return 0; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index f43e42ede75e..287e5d407f08 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -255,7 +255,7 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @pagep: Pointer to return the page + * @foliop: Pointer to return the folio * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create @@ -265,7 +265,7 @@ out: static int ecryptfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; @@ -276,7 +276,7 @@ static int ecryptfs_write_begin(struct file *file, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; prev_page_end_size = ((loff_t)index << PAGE_SHIFT); if (!folio_test_uptodate(folio)) { @@ -365,7 +365,6 @@ out: if (unlikely(rc)) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; } return rc; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 7144472d092e..e19469e88000 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -535,20 +535,20 @@ static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end) while (start < end) { u32 zerofrom, len; - struct page *page = NULL; + struct folio *folio; zerofrom = start & (PAGE_SIZE - 1); len = PAGE_SIZE - zerofrom; if (start + len > end) len = end - start; - err = ops->write_begin(file, mapping, start, len, &page, NULL); + err = ops->write_begin(file, mapping, start, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, zerofrom + len); + folio_zero_range(folio, offset_in_folio(folio, start), len); - err = ops->write_end(file, mapping, start, len, len, page_folio(page), NULL); + err = ops->write_end(file, mapping, start, len, len, folio, NULL); if (err < 0) goto out; start += len; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 871e9e3e407e..05f0e07b01d0 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -448,12 +448,11 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) static int exfat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned int len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block); + ret = block_write_begin(mapping, pos, len, foliop, exfat_get_block); if (ret < 0) exfat_write_failed(mapping, pos+len); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index aba41f6150e5..30f8201c155f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -916,11 +916,11 @@ static void ext2_readahead(struct readahead_control *rac) static int ext2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ext2_get_block); + ret = block_write_begin(mapping, pos, len, foliop, ext2_get_block); if (ret < 0) ext2_write_failed(mapping, pos + len); return ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08acd152261e..c3429b664b8d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3563,13 +3563,13 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep); + struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e7a09a99837b..b7ea9cb4c398 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -660,7 +660,7 @@ out_nofolio: int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep) + struct folio **foliop) { int ret; handle_t *handle; @@ -708,7 +708,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto out; } - *pagep = &folio->page; + *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; @@ -891,7 +891,7 @@ out: int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, - struct page **pagep, + struct folio **foliop, void **fsdata) { int ret; @@ -954,7 +954,7 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = &folio->page; + *foliop = folio; brelse(iloc.bh); return 1; out_release_page: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e19ac0a82bdc..90ef8ec5c59b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1145,7 +1145,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; @@ -1170,7 +1170,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, - pagep); + foliop); if (ret < 0) return ret; if (ret == 1) @@ -1270,7 +1270,7 @@ retry_journal: folio_put(folio); return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } @@ -2924,7 +2924,7 @@ static int ext4_nonda_switch(struct super_block *sb) static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; @@ -2939,14 +2939,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, - len, pagep, fsdata); + len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - pagep, fsdata); + foliop, fsdata); if (ret < 0) return ret; if (ret == 1) @@ -2981,7 +2981,7 @@ retry: return ret; } - *pagep = &folio->page; + *foliop = folio; return ret; } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 86ef272296ab..d9203228ce97 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -76,17 +76,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page_folio(page), fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4c32e2cfc403..5dfa0207ad8f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3552,7 +3552,7 @@ reserve_block: } static int f2fs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -3584,18 +3584,20 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; + struct page *page; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; - ret = f2fs_prepare_compress_overwrite(inode, pagep, + ret = f2fs_prepare_compress_overwrite(inode, &page, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { + *foliop = page_folio(page); return 0; } } @@ -3615,7 +3617,7 @@ repeat: /* TODO: cluster can be compressed due to race with .writepage */ - *pagep = &folio->page; + *foliop = folio; if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, &folio->page, pos, len, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cfd38cd4f82c..176b5177c89d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2677,7 +2677,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, const struct address_space_operations *a_ops = mapping->a_ops; int offset = off & (sb->s_blocksize - 1); size_t towrite = len; - struct page *page; + struct folio *folio; void *fsdata = NULL; int err = 0; int tocopy; @@ -2687,7 +2687,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, - &page, &fsdata); + &folio, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); @@ -2697,10 +2697,10 @@ retry: break; } - memcpy_to_page(page, offset, data, tocopy); + memcpy_to_folio(folio, offset_in_folio(folio, off), data, tocopy); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page_folio(page), fsdata); + folio, fsdata); offset = 0; towrite -= tocopy; off += tocopy; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 486512144bf1..84a33fe49bed 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -80,17 +80,17 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, while (count) { size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; void *fsdata = NULL; int res; - res = aops->write_begin(NULL, mapping, pos, n, &page, &fsdata); + res = aops->write_begin(NULL, mapping, pos, n, &folio, &fsdata); if (res) return res; - memcpy_to_page(page, offset_in_page(pos), buf, n); + memcpy_to_folio(folio, offset_in_folio(folio, pos), buf, n); - res = aops->write_end(NULL, mapping, pos, n, n, page_folio(page), fsdata); + res = aops->write_end(NULL, mapping, pos, n, n, folio, fsdata); if (res < 0) return res; if (res != n) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8ed02b17d591..75722bbd6b5f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -221,13 +221,12 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) static int fat_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int err; - *pagep = NULL; err = cont_write_begin(file, mapping, pos, len, - pagep, fsdata, fat_get_block, + foliop, fsdata, fat_get_block, &MSDOS_I(mapping->host)->mmu_private); if (err < 0) fat_write_failed(mapping, pos + len); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a5cd6e4f9b2b..bc49b2eeadf3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2387,7 +2387,7 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); @@ -2421,7 +2421,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (err) goto cleanup; success: - *pagep = &folio->page; + *foliop = folio; return 0; cleanup: diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 30512dbb79f0..4a0ce131e233 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -487,15 +487,15 @@ void hfs_file_truncate(struct inode *inode) if (inode->i_size > HFS_I(inode)->phys_size) { struct address_space *mapping = inode->i_mapping; void *fsdata = NULL; - struct page *page; + struct folio *folio; /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = hfs_write_begin(NULL, mapping, size + 1, 0, &page, + res = hfs_write_begin(NULL, mapping, size + 1, 0, &folio, &fsdata); if (!res) { res = generic_write_end(NULL, mapping, size + 1, 0, 0, - page_folio(page), fsdata); + folio, fsdata); } if (res) inode->i_size = HFS_I(inode)->phys_size; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index b5a6ad5df357..a0c7cb0f79fc 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -202,7 +202,7 @@ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 744e10b46904..a81ce7a740b9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -45,12 +45,11 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) } int hfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 776c0ea722cb..a6d61685ae79 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -554,16 +554,16 @@ void hfsplus_file_truncate(struct inode *inode) if (inode->i_size > hip->phys_size) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; void *fsdata = NULL; loff_t size = inode->i_size; res = hfsplus_write_begin(NULL, mapping, size, 0, - &page, &fsdata); + &folio, &fsdata); if (res) return; res = generic_write_end(NULL, mapping, size, 0, 0, - page_folio(page), fsdata); + folio, fsdata); if (res < 0) return; mark_inode_dirty(inode); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9e78f181c24f..59ce81dca73f 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -472,7 +472,7 @@ extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata); + loff_t pos, unsigned len, struct folio **foliop, void **fsdata); struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode); void hfsplus_delete_inode(struct inode *inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 3d326926c195..f331e9574217 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -39,12 +39,11 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) } int hfsplus_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index d0cbc5ece952..6d1cf2436ead 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -465,12 +465,13 @@ static int hostfs_read_folio(struct file *file, struct folio *folio) static int hostfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; - *pagep = grab_cache_page_write_begin(mapping, index); - if (!*pagep) + *foliop = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!*foliop) return -ENOMEM; return 0; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 11e2d9e612ac..449a3fc1b8d9 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -190,12 +190,11 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) static int hpfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hpfs_get_block, &hpfs_i(mapping->host)->mmu_private); if (unlikely(ret)) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b7a30055e6f4..5cf327337e22 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,7 +388,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { return -EINVAL; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index b6fe1f1e8ea9..ada572c466f8 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -26,7 +26,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, struct folio *folio, void *fsdata); static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); static int jffs2_read_folio(struct file *filp, struct folio *folio); int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) @@ -125,7 +125,7 @@ static int jffs2_read_folio(struct file *file, struct folio *folio) static int jffs2_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct folio *folio; struct inode *inode = mapping->host; @@ -212,7 +212,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, ret = PTR_ERR(folio); goto release_sem; } - *pagep = &folio->page; + *foliop = folio; /* * Read in the folio if it wasn't already present. Cannot optimize away diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d63494182548..07cfdc440596 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -292,11 +292,11 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) static int jfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); diff --git a/fs/libfs.c b/fs/libfs.c index 4581be1d5b32..1a776edb39de 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -901,7 +901,7 @@ static int simple_read_folio(struct file *file, struct folio *folio) int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct folio *folio; @@ -910,7 +910,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 0002337977e0..abb190c46c04 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -444,11 +444,11 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) static int minix_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, minix_get_block); + ret = block_write_begin(mapping, pos, len, foliop, minix_get_block); if (unlikely(ret)) minix_write_failed(mapping, pos + len); diff --git a/fs/namei.c b/fs/namei.c index fb7055245e68..1bf081959066 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5304,7 +5304,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); - struct page *page; + struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; @@ -5312,16 +5312,16 @@ int page_symlink(struct inode *inode, const char *symname, int len) retry: if (nofs) flags = memalloc_nofs_save(); - err = aops->write_begin(NULL, mapping, 0, len-1, &page, &fsdata); + err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; - memcpy(page_address(page), symname, len-1); + memcpy(folio_address(folio), symname, len - 1); - err = aops->write_end(NULL, mapping, 0, len-1, len-1, - page_folio(page), fsdata); + err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, + folio, fsdata); if (err < 0) goto fail; if (err < len-1) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index fe583dbebe54..6800ee92d742 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -336,7 +336,7 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, * increment the page use counts until he is done with the page. */ static int nfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { fgf_t fgp = FGP_WRITEBEGIN; @@ -353,7 +353,7 @@ start: mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; ret = nfs_flush_incompatible(file, folio); if (ret) { diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 25841b75415a..8661f452dba6 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -250,7 +250,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) static int nilfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; @@ -259,7 +259,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) return err; - err = block_write_begin(mapping, pos, len, pagep, nilfs_get_block); + err = block_write_begin(mapping, pos, len, foliop, nilfs_get_block); if (unlikely(err)) { nilfs_write_failed(mapping, pos + len); nilfs_transaction_abort(inode->i_sb); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 40c5dfbc9d41..c2e60a6bd060 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -498,7 +498,6 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; - struct page *page; struct folio *folio; loff_t pos; int err = 0, err2 = 0; @@ -513,7 +512,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, - &page, nilfs_get_block); + &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; @@ -523,8 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, goto failed_inode; } - folio = page_folio(page); - err = nilfs_recovery_copy_block(nilfs, rb, pos, page); + err = nilfs_recovery_copy_block(nilfs, rb, pos, &folio->page); if (unlikely(err)) goto failed_page; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 88a7d81cf2ba..6202895a4542 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -182,7 +182,7 @@ static int ntfs_extend_initialized_size(struct file *file, for (;;) { u32 zerofrom, len; - struct page *page; + struct folio *folio; u8 bits; CLST vcn, lcn, clen; @@ -208,14 +208,13 @@ static int ntfs_extend_initialized_size(struct file *file, if (pos + len > new_valid) len = new_valid - pos; - err = ntfs_write_begin(file, mapping, pos, len, &page, NULL); + err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL); if (err) goto out; - zero_user_segment(page, zerofrom, PAGE_SIZE); + folio_zero_range(folio, zerofrom, folio_size(folio)); - /* This function in any case puts page. */ - err = ntfs_write_end(file, mapping, pos, len, len, page_folio(page), NULL); + err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL); if (err < 0) goto out; pos += len; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 9efd3f7c59d6..f672072e6bd4 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -901,7 +901,7 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, } int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata) + loff_t pos, u32 len, struct folio **foliop, void **fsdata) { int err; struct inode *inode = mapping->host; @@ -910,7 +910,6 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; - *pagep = NULL; if (is_resident(ni)) { struct folio *folio = __filemap_get_folio( mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, @@ -926,7 +925,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, ni_unlock(ni); if (!err) { - *pagep = &folio->page; + *foliop = folio; goto out; } folio_unlock(folio); @@ -936,7 +935,7 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping, goto out; } - err = block_write_begin(mapping, pos, len, pagep, + err = block_write_begin(mapping, pos, len, foliop, ntfs_get_block_write_begin); out: diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 53517281f26b..584f814715f4 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -711,7 +711,7 @@ int ntfs_set_size(struct inode *inode, u64 new_size); int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create); int ntfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, u32 len, struct page **pagep, void **fsdata); + loff_t pos, u32 len, struct folio **foliop, void **fsdata); int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9d8aa417e8da..d6c985cc6353 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1643,7 +1643,7 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; @@ -1826,8 +1826,8 @@ try_again: ocfs2_free_alloc_context(meta_ac); success: - if (pagep) - *pagep = wc->w_target_page; + if (foliop) + *foliop = page_folio(wc->w_target_page); *fsdata = wc; return 0; out_quota: @@ -1879,7 +1879,7 @@ out: static int ocfs2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; struct buffer_head *di_bh = NULL; @@ -1901,7 +1901,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER, - pagep, fsdata, di_bh, NULL); + foliop, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); goto out_fail; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 3a520117fa59..45db1781ea73 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -38,7 +38,7 @@ typedef enum { int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, - struct page **pagep, void **fsdata, + struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); int ocfs2_read_inline_data(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 1834f26522ed..6ef4cb045ccd 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -53,7 +53,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, loff_t pos = page_offset(page); unsigned int len = PAGE_SIZE; pgoff_t last_index; - struct page *locked_page = NULL; + struct folio *locked_folio = NULL; void *fsdata; loff_t size = i_size_read(inode); @@ -91,7 +91,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, len = ((size - 1) & ~PAGE_MASK) + 1; err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, - &locked_page, &fsdata, di_bh, page); + &locked_folio, &fsdata, di_bh, page); if (err) { if (err != -ENOSPC) mlog_errno(err); @@ -99,7 +99,7 @@ static vm_fault_t __ocfs2_page_mkwrite(struct file *file, goto out; } - if (!locked_page) { + if (!locked_folio) { ret = VM_FAULT_NOPAGE; goto out; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 6b580b9da8e3..98358d405b6a 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -312,11 +312,11 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) static int omfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, omfs_get_block); + ret = block_write_begin(mapping, pos, len, foliop, omfs_get_block); if (unlikely(ret)) omfs_write_failed(mapping, pos + len); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 69b507a611f8..aae6d2b8767d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -309,7 +309,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) static int orangefs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct orangefs_write_range *wr; struct folio *folio; @@ -320,7 +320,7 @@ static int orangefs_write_begin(struct file *file, if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (folio_test_dirty(folio) && !folio_test_private(folio)) { /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f4b9db4b4df2..ea23872ba24f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2735,7 +2735,7 @@ static void reiserfs_truncate_failed_write(struct inode *inode) static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode; struct folio *folio; @@ -2749,7 +2749,7 @@ static int reiserfs_write_begin(struct file *file, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; reiserfs_wait_on_write_block(inode->i_sb); fix_tail_page_for_writing(&folio->page); diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index c8511e286673..8864438817a6 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -483,11 +483,11 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) static int sysv_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, get_block); + ret = block_write_begin(mapping, pos, len, foliop, get_block); if (unlikely(ret)) sysv_write_failed(mapping, pos + len); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index baa3eecc32fe..5130123005e4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -211,7 +211,7 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct folio **foliop) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -298,7 +298,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = &folio->page; + *foliop = folio; return 0; } @@ -414,7 +414,7 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio, */ static int ubifs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -483,7 +483,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_unlock(folio); folio_put(folio); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, foliop); } /* @@ -492,7 +492,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = &folio->page; + *foliop = folio; return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index fdf024e0b772..eaee57b91c6c 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -246,14 +246,14 @@ static void udf_readahead(struct readahead_control *rac) static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - ret = block_write_begin(mapping, pos, len, pagep, + ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); @@ -265,7 +265,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); - *pagep = &folio->page; + *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 69adb198c634..f43461652d9f 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -498,11 +498,11 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) static int ufs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, pagep, ufs_getfrag_block); + ret = block_write_begin(mapping, pos, len, foliop, ufs_getfrag_block); if (unlikely(ret)) ufs_write_failed(mapping, pos + len); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 165e859664a5..254563a2a9de 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,7 +258,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, get_block_t *get_block); + struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, @@ -269,7 +269,7 @@ int generic_write_end(struct file *, struct address_space *, struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **, + unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct page *page, unsigned int from, unsigned int to); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1af3373c7cd2..7a79b88a8f5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -408,7 +408,7 @@ struct address_space_operations { int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); @@ -3331,7 +3331,7 @@ extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); diff --git a/mm/filemap.c b/mm/filemap.c index fab6b0c3044e..29fec1fccd0a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3987,7 +3987,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) ssize_t written = 0; do { - struct page *page; struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ @@ -4017,11 +4016,10 @@ retry: } status = a_ops->write_begin(file, mapping, pos, bytes, - &page, &fsdata); + &folio, &fsdata); if (unlikely(status < 0)) break; - folio = page_folio(page); offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; diff --git a/mm/shmem.c b/mm/shmem.c index 1116f147d788..dbc351b4b153 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2882,7 +2882,7 @@ static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata) + struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -2903,14 +2903,14 @@ shmem_write_begin(struct file *file, struct address_space *mapping, if (ret) return ret; - *pagep = folio_file_page(folio, index); - if (PageHWPoison(*pagep)) { + if (folio_test_hwpoison(folio) || + (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) { folio_unlock(folio); folio_put(folio); - *pagep = NULL; return -EIO; } + *foliop = folio; return 0; } From 7f90d7f1bc9418aba1ef040565308e83dbe0f485 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 23:04:59 -0400 Subject: [PATCH 054/168] ocfs2: Convert ocfs2_write_zero_page to use a folio Removes a conversion of folio to page, and then two hidden conversions of page back to folio. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/ocfs2/file.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ccc57038a977..cfec1782ac2a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -755,7 +755,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, u64 abs_to, struct buffer_head *di_bh) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; unsigned long index = abs_from >> PAGE_SHIFT; handle_t *handle; int ret = 0; @@ -774,9 +774,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out; } - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); mlog_errno(ret); goto out_commit_trans; } @@ -803,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(page, block_start + 1, 0, + ret = __block_write_begin(&folio->page, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); @@ -812,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - block_commit_write(page, block_start + 1, block_start + 1); + block_commit_write(&folio->page, block_start + 1, block_start + 1); } /* @@ -833,8 +834,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, } out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_commit_trans: if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); From 9f04609f74ec7a439e1ac42da5db9e6ddf4f7b13 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 23:09:04 -0400 Subject: [PATCH 055/168] buffer: Convert __block_write_begin() to take a folio Almost all callers have a folio now, so change __block_write_begin() to take a folio and remove a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 5 ++--- fs/ext2/dir.c | 2 +- fs/ext4/inline.c | 6 +++--- fs/ext4/inode.c | 8 ++++---- fs/minix/inode.c | 2 +- fs/nilfs2/dir.c | 2 +- fs/ocfs2/file.c | 2 +- fs/reiserfs/inode.c | 6 +++--- fs/sysv/itree.c | 2 +- fs/udf/file.c | 2 +- fs/ufs/inode.c | 2 +- include/linux/buffer_head.h | 2 +- 12 files changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index d52a740f7fca..45eb06cb1a4e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2168,11 +2168,10 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, return err; } -int __block_write_begin(struct page *page, loff_t pos, unsigned len, +int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - return __block_write_begin_int(page_folio(page), pos, len, get_block, - NULL); + return __block_write_begin_int(folio, pos, len, get_block, NULL); } EXPORT_SYMBOL(__block_write_begin); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 60605fbdd0eb..2f3042d0174c 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -434,7 +434,7 @@ int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, ext2_get_block); + return __block_write_begin(folio, pos, len, ext2_get_block); } static int ext2_handle_dirsync(struct inode *dir) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b7ea9cb4c398..d5c2f3b75084 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -601,10 +601,10 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(&folio->page, from, to, + ret = __block_write_begin(folio, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(&folio->page, from, to, ext4_get_block); + ret = __block_write_begin(folio, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -856,7 +856,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, goto out; } - ret = __block_write_begin(&folio->page, 0, inline_size, + ret = __block_write_begin(folio, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 90ef8ec5c59b..03374dc215d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1224,10 +1224,10 @@ retry_journal: ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(&folio->page, pos, len, + ret = __block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); + ret = __block_write_begin(folio, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, @@ -2962,7 +2962,7 @@ retry: #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { folio_unlock(folio); @@ -6216,7 +6216,7 @@ retry_alloc: if (folio_pos(folio) + len > size) len = size - folio_pos(folio); - err = __block_write_begin(&folio->page, 0, len, ext4_get_block); + err = __block_write_begin(folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index abb190c46c04..f007e389d5d2 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -429,7 +429,7 @@ static int minix_read_folio(struct file *file, struct folio *folio) int minix_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, minix_get_block); + return __block_write_begin(folio, pos, len, minix_get_block); } static void minix_write_failed(struct address_space *mapping, loff_t to) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 66af42f88ca7..4b3e19d74925 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -83,7 +83,7 @@ static int nilfs_prepare_chunk(struct folio *folio, unsigned int from, { loff_t pos = folio_pos(folio) + from; - return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block); + return __block_write_begin(folio, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct folio *folio, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index cfec1782ac2a..d058eb30cfc2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -804,7 +804,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, * __block_write_begin and block_commit_write to zero the * whole block. */ - ret = __block_write_begin(&folio->page, block_start + 1, 0, + ret = __block_write_begin(folio, block_start + 1, 0, ocfs2_get_block); if (ret < 0) { mlog_errno(ret); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea23872ba24f..72c53129c952 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2198,7 +2198,7 @@ static int grab_tail_page(struct inode *inode, /* start within the page of the last block in the file */ start = (offset / blocksize) * blocksize; - error = __block_write_begin(&folio->page, start, offset - start, + error = __block_write_begin(folio, start, offset - start, reiserfs_get_block_create_0); if (error) goto unlock; @@ -2762,7 +2762,7 @@ static int reiserfs_write_begin(struct file *file, old_ref = th->t_refcount; th->t_refcount++; } - ret = __block_write_begin(&folio->page, pos, len, reiserfs_get_block); + ret = __block_write_begin(folio, pos, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* @@ -2822,7 +2822,7 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len) th->t_refcount++; } - ret = __block_write_begin(page, from, len, reiserfs_get_block); + ret = __block_write_begin(page_folio(page), from, len, reiserfs_get_block); if (ret && reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th = current->journal_info; /* diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 8864438817a6..451e95f474fa 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -468,7 +468,7 @@ static int sysv_read_folio(struct file *file, struct folio *folio) int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, get_block); + return __block_write_begin(folio, pos, len, get_block); } static void sysv_write_failed(struct address_space *mapping, loff_t to) diff --git a/fs/udf/file.c b/fs/udf/file.c index 3a4179de316b..412fe7c4d348 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -62,7 +62,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) end = size & ~PAGE_MASK; else end = PAGE_SIZE; - err = __block_write_begin(&folio->page, 0, end, udf_get_block); + err = __block_write_begin(folio, 0, end, udf_get_block); if (err) { folio_unlock(folio); ret = vmf_fs_error(err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f43461652d9f..5331ae7ebf3e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -481,7 +481,7 @@ static int ufs_read_folio(struct file *file, struct folio *folio) int ufs_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(&folio->page, pos, len, ufs_getfrag_block); + return __block_write_begin(folio, pos, len, ufs_getfrag_block); } static void ufs_truncate_blocks(struct inode *); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 254563a2a9de..c85e65fd4cdc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -259,7 +259,7 @@ int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, +int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, From 7b9d14af8777ac439bbfa9ac73a12a6d85289e7e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:49 +0200 Subject: [PATCH 056/168] fs: allow mount namespace fd We already allow a mount namespace id, enable mount namespace file descriptors as well. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-2-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 221db9de4729..1a0b9e3089a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5243,12 +5243,37 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, * that, or if not simply grab a passive reference on our mount namespace and * return that. */ -static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) +static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { - if (mnt_ns_id) - return lookup_mnt_ns(mnt_ns_id); - refcount_inc(¤t->nsproxy->mnt_ns->passive); - return current->nsproxy->mnt_ns; + struct mnt_namespace *mnt_ns; + + if (kreq->mnt_ns_id && kreq->spare) + return ERR_PTR(-EINVAL); + + if (kreq->mnt_ns_id) + return lookup_mnt_ns(kreq->mnt_ns_id); + + if (kreq->spare) { + struct ns_common *ns; + + CLASS(fd, f)(kreq->spare); + if (!f.file) + return ERR_PTR(-EBADF); + + if (!proc_ns_file(f.file)) + return ERR_PTR(-EINVAL); + + ns = get_proc_ns(file_inode(f.file)); + if (ns->ops->type != CLONE_NEWNS) + return ERR_PTR(-EINVAL); + + mnt_ns = to_mnt_ns(ns); + } else { + mnt_ns = current->nsproxy->mnt_ns; + } + + refcount_inc(&mnt_ns->passive); + return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, @@ -5269,7 +5294,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; - ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; @@ -5396,7 +5421,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!kmnt_ids) return -ENOMEM; - ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; From 5fcf329676cf9d132842d9871f83a091c32f3bfc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:50 +0200 Subject: [PATCH 057/168] fs: add put_mnt_ns() cleanup helper Add a simple helper to put a mount namespace reference. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-3-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/mnt_namespace.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 8f882f5881e8..70b366b64816 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -3,6 +3,9 @@ #define _NAMESPACE_H_ #ifdef __KERNEL__ +#include +#include + struct mnt_namespace; struct fs_struct; struct user_namespace; @@ -11,6 +14,7 @@ struct ns_common; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); +DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) extern struct ns_common *from_mnt_ns(struct mnt_namespace *); extern const struct file_operations proc_mounts_operations; From 257b1c2c78c25643526609dee0c15f1544eb3252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:51 +0200 Subject: [PATCH 058/168] file: add fput() cleanup helper Add a simple helper to put a file reference. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-4-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/file.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739..d1e768b06069 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -11,6 +11,7 @@ #include #include #include +#include struct file; @@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that From a1d220d9dafa8d76ba60a784a1016c3134e6a1e8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:52 +0200 Subject: [PATCH 059/168] nsfs: iterate through mount namespaces It is already possible to list mounts in other mount namespaces and to retrieve namespace file descriptors without having to go through procfs by deriving them from pidfds. Augment these abilities by adding the ability to retrieve information about a mount namespace via NS_MNT_GET_INFO. This will return the mount namespace id and the number of mounts currently in the mount namespace. The number of mounts can be used to size the buffer that needs to be used for listmount() and is in general useful without having to actually iterate through all the mounts. The structure is extensible. And add the ability to iterate through all mount namespaces over which the caller holds privilege returning the file descriptor for the next or previous mount namespace. To retrieve a mount namespace the caller must be privileged wrt to it's owning user namespace. This means that PID 1 on the host can list all mounts in all mount namespaces or that a container can list all mounts of its nested containers. Optionally pass a structure for NS_MNT_GET_INFO with NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount namespace in one go. Both ioctls can be implemented for other namespace types easily. Together with recent api additions this means one can iterate through all mounts in all mount namespaces without ever touching procfs. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-5-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/mount.h | 13 +++++ fs/namespace.c | 37 ++++++++++++-- fs/nsfs.c | 102 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/nsfs.h | 16 ++++++ 4 files changed, 161 insertions(+), 7 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index ad4b1ddebb54..c1db0c709c6a 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -155,3 +155,16 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); bool has_locked_children(struct mount *mnt, struct dentry *dentry); +struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); +static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) +{ + return __lookup_next_mnt_ns(mntns, false); +} +static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) +{ + return __lookup_next_mnt_ns(mntns, true); +} +static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); +} diff --git a/fs/namespace.c b/fs/namespace.c index 1a0b9e3089a2..3dabe49f7376 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2060,16 +2060,43 @@ static bool is_mnt_ns_file(struct dentry *dentry) dentry->d_fsdata == &mntns_operations; } -static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) -{ - return container_of(ns, struct mnt_namespace, ns); -} - struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } +struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) +{ + guard(read_lock)(&mnt_ns_tree_lock); + for (;;) { + struct rb_node *node; + + if (previous) + node = rb_prev(&mntns->mnt_ns_tree_node); + else + node = rb_next(&mntns->mnt_ns_tree_node); + if (!node) + return ERR_PTR(-ENOENT); + + mntns = node_to_mnt_ns(node); + node = &mntns->mnt_ns_tree_node; + + if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) + continue; + + /* + * Holding mnt_ns_tree_lock prevents the mount namespace from + * being freed but it may well be on it's deathbed. We want an + * active reference, not just a passive one here as we're + * persisting the mount namespace. + */ + if (!refcount_inc_not_zero(&mntns->ns.count)) + continue; + + return mntns; + } +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a diff --git a/fs/nsfs.c b/fs/nsfs.c index 97c37a9631e5..67ee176b8824 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "mount.h" #include "internal.h" @@ -128,6 +129,30 @@ int open_related_ns(struct ns_common *ns, } EXPORT_SYMBOL_GPL(open_related_ns); +static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, + struct mnt_ns_info __user *uinfo, size_t usize, + struct mnt_ns_info *kinfo) +{ + /* + * If userspace and the kernel have the same struct size it can just + * be copied. If userspace provides an older struct, only the bits that + * userspace knows about will be copied. If userspace provides a new + * struct, only the bits that the kernel knows aobut will be copied and + * the size value will be set to the size the kernel knows about. + */ + kinfo->size = min(usize, sizeof(*kinfo)); + kinfo->mnt_ns_id = mnt_ns->seq; + kinfo->nr_mounts = READ_ONCE(mnt_ns->nr_mounts); + /* Subtract the root mount of the mount namespace. */ + if (kinfo->nr_mounts) + kinfo->nr_mounts--; + + if (copy_to_user(uinfo, kinfo, kinfo->size)) + return -EFAULT; + + return 0; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -135,6 +160,8 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct pid_namespace *pid_ns; struct task_struct *tsk; struct ns_common *ns = get_proc_ns(file_inode(filp)); + struct mnt_namespace *mnt_ns; + bool previous = false; uid_t __user *argp; uid_t uid; int ret; @@ -156,7 +183,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); case NS_GET_MNTNS_ID: { - struct mnt_namespace *mnt_ns; __u64 __user *idp; __u64 id; @@ -211,7 +237,79 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (!ret) ret = -ESRCH; - break; + return ret; + } + } + + /* extensible ioctls */ + switch (_IOC_NR(ioctl)) { + case _IOC_NR(NS_MNT_GET_INFO): { + struct mnt_ns_info kinfo = {}; + struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; + size_t usize = _IOC_SIZE(ioctl); + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + if (!uinfo) + return -EINVAL; + + if (usize < MNT_NS_INFO_SIZE_VER0) + return -EINVAL; + + return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + } + case _IOC_NR(NS_MNT_GET_PREV): + previous = true; + fallthrough; + case _IOC_NR(NS_MNT_GET_NEXT): { + struct mnt_ns_info kinfo = {}; + struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; + struct path path __free(path_put) = {}; + struct file *f __free(fput) = NULL; + size_t usize = _IOC_SIZE(ioctl); + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + if (usize < MNT_NS_INFO_SIZE_VER0) + return -EINVAL; + + if (previous) + mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); + else + mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + if (IS_ERR(mnt_ns)) + return PTR_ERR(mnt_ns); + + ns = to_ns_common(mnt_ns); + /* Transfer ownership of @mnt_ns reference to @path. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ret; + + CLASS(get_unused_fd, fd)(O_CLOEXEC); + if (fd < 0) + return fd; + + f = dentry_open(&path, O_RDONLY, current_cred()); + if (IS_ERR(f)) + return PTR_ERR(f); + + if (uinfo) { + /* + * If @uinfo is passed return all information about the + * mount namespace as well. + */ + ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo); + if (ret) + return ret; + } + + /* Transfer reference of @f to caller's fdtable. */ + fd_install(fd, no_free_ptr(f)); + /* File descriptor is live so hand it off to the caller. */ + return take_fd(fd); } default: ret = -ENOTTY; diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index b133211331f6..cb31af3c1840 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -3,6 +3,7 @@ #define __LINUX_NSFS_H #include +#include #define NSIO 0xb7 @@ -26,4 +27,19 @@ /* Return thread-group leader id of pid in the target pid namespace. */ #define NS_GET_TGID_IN_PIDNS _IOR(NSIO, 0x9, int) +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ + +/* Get information about namespace. */ +#define NS_MNT_GET_INFO _IOR(NSIO, 10, struct mnt_ns_info) +/* Get next namespace. */ +#define NS_MNT_GET_NEXT _IOR(NSIO, 11, struct mnt_ns_info) +/* Get previous namespace. */ +#define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) + #endif /* __LINUX_NSFS_H */ From a037d5e7f81bae8ff69eb670b2ec3f25ad4d2cc2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Aug 2023 15:38:40 -0400 Subject: [PATCH 060/168] fs: add infrastructure for multigrain timestamps The VFS has always used coarse-grained timestamps when updating the ctime and mtime after a change. This has the benefit of allowing filesystems to optimize away a lot metadata updates, down to around 1 per jiffy, even when a file is under heavy writes. Unfortunately, this has always been an issue when we're exporting via NFSv3, which relies on timestamps to validate caches. A lot of changes can happen in a jiffy, so timestamps aren't sufficient to help the client decide when to invalidate the cache. Even with NFSv4, a lot of exported filesystems don't properly support a change attribute and are subject to the same problems with timestamp granularity. Other applications have similar issues with timestamps (e.g backup applications). If we were to always use fine-grained timestamps, that would improve the situation, but that becomes rather expensive, as the underlying filesystem would have to log a lot more metadata updates. What we need is a way to only use fine-grained timestamps when they are being actively queried. Use the (unused) top bit in inode->i_ctime_nsec as a flag that indicates whether the current timestamps have been queried via stat() or the like. When it's set, we allow the kernel to use a fine-grained timestamp iff it's necessary to make the ctime show a different value. This solves the problem of being able to distinguish the timestamp between updates, but introduces a new problem: it's now possible for a file being changed to get a fine-grained timestamp. A file that is altered just a bit later can then get a coarse-grained one that appears older than the earlier fine-grained time. This violates timestamp ordering guarantees. To remedy this, keep a global monotonic atomic64_t value that acts as a timestamp floor. When we go to stamp a file, we first get the latter of the current floor value and the current coarse-grained time. If the inode ctime hasn't been queried then we just attempt to stamp it with that value. If it has been queried, then first see whether the current coarse time is later than the existing ctime. If it is, then we accept that value. If it isn't, then we get a fine-grained time and try to swap that into the global floor. Whether that succeeds or fails, we take the resulting floor time, convert it to realtime and try to swap that into the ctime. We take the result of the ctime swap whether it succeeds or fails, since either is just as valid. Filesystems can opt into this by setting the FS_MGTIME fstype flag. Others should be unaffected (other than being subject to the same floor value as multigrain filesystems). Reviewed-by: Darrick J. Wong Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/inode.c | 176 ++++++++++++++++++++++++++++++++++++++------- fs/stat.c | 39 +++++++++- include/linux/fs.h | 34 ++++++--- 3 files changed, 212 insertions(+), 37 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 86670941884b..79995b481ff4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -60,6 +60,13 @@ static unsigned int i_hash_shift __ro_after_init; static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); +/* + * This represents the latest fine-grained time that we have handed out as a + * timestamp on the system. Tracked as a monotonic value, and converted to the + * realtime clock on an as-needed basis. + */ +static __cacheline_aligned_in_smp atomic64_t ctime_floor; + /* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -2137,19 +2144,72 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); +/** + * coarse_ctime - return the current coarse-grained time + * @floor: current (monotonic) ctime_floor value + * + * Get the coarse-grained time, and then determine whether to + * return it or the current floor value. Returns the later of the + * floor and coarse grained timestamps, converted to realtime + * clock value. + */ +static ktime_t coarse_ctime(ktime_t floor) +{ + ktime_t coarse = ktime_get_coarse(); + + /* If coarse time is already newer, return that */ + if (!ktime_after(floor, coarse)) + return ktime_get_coarse_real(); + return ktime_mono_to_real(floor); +} + +/** + * current_time - Return FS time (possibly fine-grained) + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp. + */ +struct timespec64 current_time(struct inode *inode) +{ + ktime_t floor = atomic64_read(&ctime_floor); + ktime_t now = coarse_ctime(floor); + struct timespec64 now_ts = ktime_to_timespec64(now); + u32 cns; + + if (!is_mgtime(inode)) + goto out; + + /* If nothing has queried it, then coarse time is fine */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + /* + * If there is no apparent change, then + * get a fine-grained timestamp. + */ + if (now_ts.tv_nsec == (cns & ~I_CTIME_QUERIED)) + ktime_get_real_ts64(&now_ts); + } +out: + return timestamp_truncate(now_ts, inode); +} +EXPORT_SYMBOL(current_time); + static int inode_needs_update_time(struct inode *inode) { + struct timespec64 now, ts; int sync_it = 0; - struct timespec64 now = current_time(inode); - struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; + now = current_time(inode); + ts = inode_get_mtime(inode); if (!timespec64_equal(&ts, &now)) - sync_it = S_MTIME; + sync_it |= S_MTIME; ts = inode_get_ctime(inode); if (!timespec64_equal(&ts, &now)) @@ -2527,6 +2587,15 @@ void inode_nohighmem(struct inode *inode) } EXPORT_SYMBOL(inode_nohighmem); +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) +{ + set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); + inode->i_ctime_sec = ts.tv_sec; + inode->i_ctime_nsec = ts.tv_nsec; + return ts; +} +EXPORT_SYMBOL(inode_set_ctime_to_ts); + /** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec @@ -2558,38 +2627,91 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) } EXPORT_SYMBOL(timestamp_truncate); -/** - * current_time - Return FS time - * @inode: inode. - * - * Return the current time truncated to the time granularity supported by - * the fs. - * - * Note that inode and inode->sb cannot be NULL. - * Otherwise, the function warns and returns time without truncation. - */ -struct timespec64 current_time(struct inode *inode) -{ - struct timespec64 now; - - ktime_get_coarse_real_ts64(&now); - return timestamp_truncate(now, inode); -} -EXPORT_SYMBOL(current_time); - /** * inode_set_ctime_current - set the ctime to current_time * @inode: inode * - * Set the inode->i_ctime to the current value for the inode. Returns - * the current value that was assigned to i_ctime. + * Set the inode's ctime to the current value for the inode. Returns the + * current value that was assigned. If this is not a multigrain inode, then we + * just set it to whatever the coarse_ctime is. + * + * If it is multigrain, then we first see if the coarse-grained timestamp is + * distinct from what we have. If so, then we'll just use that. If we have to + * get a fine-grained timestamp, then do so, and try to swap it into the floor. + * We accept the new floor value regardless of the outcome of the cmpxchg. + * After that, we try to swap the new value into i_ctime_nsec. Again, we take + * the resulting ctime, regardless of the outcome of the swap. */ struct timespec64 inode_set_ctime_current(struct inode *inode) { - struct timespec64 now = current_time(inode); + ktime_t now, floor = atomic64_read(&ctime_floor); + struct timespec64 now_ts; + u32 cns, cur; - inode_set_ctime_to_ts(inode, now); - return now; + now = coarse_ctime(floor); + + /* Just return that if this is not a multigrain fs */ + if (!is_mgtime(inode)) { + now_ts = timestamp_truncate(ktime_to_timespec64(now), inode); + inode_set_ctime_to_ts(inode, now_ts); + goto out; + } + + /* + * We only need a fine-grained time if someone has queried it, + * and the current coarse grained time isn't later than what's + * already there. + */ + cns = smp_load_acquire(&inode->i_ctime_nsec); + if (cns & I_CTIME_QUERIED) { + ktime_t ctime = ktime_set(inode->i_ctime_sec, cns & ~I_CTIME_QUERIED); + + if (!ktime_after(now, ctime)) { + ktime_t old, fine; + + /* Get a fine-grained time */ + fine = ktime_get(); + + /* + * If the cmpxchg works, we take the new floor value. If + * not, then that means that someone else changed it after we + * fetched it but before we got here. That value is just + * as good, so keep it. + */ + old = floor; + if (!atomic64_try_cmpxchg(&ctime_floor, &old, fine)) + fine = old; + now = ktime_mono_to_real(fine); + } + } + now_ts = timestamp_truncate(ktime_to_timespec64(now), inode); + cur = cns; + + /* No need to cmpxchg if it's exactly the same */ + if (cns == now_ts.tv_nsec && inode->i_ctime_sec == now_ts.tv_sec) + goto out; +retry: + /* Try to swap the nsec value into place. */ + if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now_ts.tv_nsec)) { + /* If swap occurred, then we're (mostly) done */ + inode->i_ctime_sec = now_ts.tv_sec; + } else { + /* + * Was the change due to someone marking the old ctime QUERIED? + * If so then retry the swap. This can only happen once since + * the only way to clear I_CTIME_QUERIED is to stamp the inode + * with a new ctime. + */ + if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) { + cns = cur; + goto retry; + } + /* Otherwise, keep the existing ctime */ + now_ts.tv_sec = inode->i_ctime_sec; + now_ts.tv_nsec = cur & ~I_CTIME_QUERIED; + } +out: + return now_ts; } EXPORT_SYMBOL(inode_set_ctime_current); diff --git a/fs/stat.c b/fs/stat.c index 89ce1be56310..a449626fd460 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -26,6 +26,35 @@ #include "internal.h" #include "mount.h" +/** + * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED + * @stat: where to store the resulting values + * @request_mask: STATX_* values requested + * @inode: inode from which to grab the c/mtime + * + * Given @inode, grab the ctime and mtime out if it and store the result + * in @stat. When fetching the value, flag it as QUERIED (if not already) + * so the next write will record a distinct timestamp. + */ +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) +{ + atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec; + + /* If neither time was requested, then don't report them */ + if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { + stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); + return; + } + + stat->mtime = inode_get_mtime(inode); + stat->ctime.tv_sec = inode->i_ctime_sec; + stat->ctime.tv_nsec = (u32)atomic_read(pcn); + if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) + stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); + stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; +} +EXPORT_SYMBOL(fill_mg_cmtime); + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from @@ -58,8 +87,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + if (is_mgtime(inode)) { + fill_mg_cmtime(stat, request_mask, inode); + } else { + stat->ctime = inode_get_ctime(inode); + stat->mtime = inode_get_mtime(inode); + } + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..f32cdc8ba0eb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1612,6 +1612,17 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode, return inode_set_mtime_to_ts(inode, ts); } +/* + * Multigrain timestamps + * + * Conditionally use fine-grained ctime and mtime timestamps when there + * are users actively observing them via getattr. The primary use-case + * for this is NFS clients that use the ctime to distinguish between + * different states of the file, and that are often fooled by multiple + * operations that occur in the same coarse-grained timer tick. + */ +#define I_CTIME_QUERIED ((u32)BIT(31)) + static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; @@ -1619,7 +1630,7 @@ static inline time64_t inode_get_ctime_sec(const struct inode *inode) static inline long inode_get_ctime_nsec(const struct inode *inode) { - return inode->i_ctime_nsec; + return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) @@ -1630,13 +1641,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode) return ts; } -static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode, - struct timespec64 ts) -{ - inode->i_ctime_sec = ts.tv_sec; - inode->i_ctime_nsec = ts.tv_nsec; - return ts; -} +struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode @@ -2494,6 +2499,7 @@ struct file_system_type { #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ +#define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -2517,6 +2523,17 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) +/** + * is_mgtime: is this inode using multigrain timestamps + * @inode: inode to test for multigrain timestamps + * + * Return true if the inode uses multigrain timestamps, false otherwise. + */ +static inline bool is_mgtime(const struct inode *inode) +{ + return inode->i_sb->s_type->fs_flags & FS_MGTIME; +} + extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); @@ -3256,6 +3273,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, From 6147cbda93bac1fd347d7ec24256c54d0e6ae274 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jun 2024 10:57:26 -0400 Subject: [PATCH 061/168] fs: tracepoints around multigrain timestamp events Add some tracepoints around various multigrain timestamp events. Reviewed-by: Josef Bacik Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/inode.c | 9 ++- fs/stat.c | 3 + include/trace/events/timestamp.h | 124 +++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/timestamp.h diff --git a/fs/inode.c b/fs/inode.c index 79995b481ff4..78d4f87b22f8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,9 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" /* @@ -2589,6 +2592,7 @@ EXPORT_SYMBOL(inode_nohighmem); struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts) { + trace_inode_set_ctime_to_ts(inode, &ts); set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec); inode->i_ctime_sec = ts.tv_sec; inode->i_ctime_nsec = ts.tv_nsec; @@ -2688,13 +2692,16 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) cur = cns; /* No need to cmpxchg if it's exactly the same */ - if (cns == now_ts.tv_nsec && inode->i_ctime_sec == now_ts.tv_sec) + if (cns == now_ts.tv_nsec && inode->i_ctime_sec == now_ts.tv_sec) { + trace_ctime_xchg_skip(inode, &now_ts); goto out; + } retry: /* Try to swap the nsec value into place. */ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now_ts.tv_nsec)) { /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now_ts.tv_sec; + trace_ctime_ns_xchg(inode, cns, now_ts.tv_nsec, cur); } else { /* * Was the change due to someone marking the old ctime QUERIED? diff --git a/fs/stat.c b/fs/stat.c index a449626fd460..9eb6d9b2d010 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -23,6 +23,8 @@ #include #include +#include + #include "internal.h" #include "mount.h" @@ -52,6 +54,7 @@ void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED)) stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn)); stat->ctime.tv_nsec &= ~I_CTIME_QUERIED; + trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime); } EXPORT_SYMBOL(fill_mg_cmtime); diff --git a/include/trace/events/timestamp.h b/include/trace/events/timestamp.h new file mode 100644 index 000000000000..c9e5ec930054 --- /dev/null +++ b/include/trace/events/timestamp.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM timestamp + +#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TIMESTAMP_H + +#include +#include + +#define CTIME_QUERIED_FLAGS \ + { I_CTIME_QUERIED, "Q" } + +DECLARE_EVENT_CLASS(ctime, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + + TP_ARGS(inode, ctime), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(time64_t, ctime_s) + __field(u32, ctime_ns) + __field(u32, gen) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->ctime_s = ctime->tv_sec; + __entry->ctime_ns = ctime->tv_nsec; + ), + + TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->ctime_s, __entry->ctime_ns + ) +); + +DEFINE_EVENT(ctime, inode_set_ctime_to_ts, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + TP_ARGS(inode, ctime)); + +DEFINE_EVENT(ctime, ctime_xchg_skip, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime), + TP_ARGS(inode, ctime)); + +TRACE_EVENT(ctime_ns_xchg, + TP_PROTO(struct inode *inode, + u32 old, + u32 new, + u32 cur), + + TP_ARGS(inode, old, new, cur), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(u32, gen) + __field(u32, old) + __field(u32, new) + __field(u32, cur) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->old = old; + __entry->new = new; + __entry->cur = cur; + ), + + TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->old & ~I_CTIME_QUERIED, + __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS), + __entry->new, + __entry->cur & ~I_CTIME_QUERIED, + __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS) + ) +); + +TRACE_EVENT(fill_mg_cmtime, + TP_PROTO(struct inode *inode, + struct timespec64 *ctime, + struct timespec64 *mtime), + + TP_ARGS(inode, ctime, mtime), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(time64_t, ctime_s) + __field(time64_t, mtime_s) + __field(u32, ctime_ns) + __field(u32, mtime_ns) + __field(u32, gen) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->gen = inode->i_generation; + __entry->ctime_s = ctime->tv_sec; + __entry->mtime_s = mtime->tv_sec; + __entry->ctime_ns = ctime->tv_nsec; + __entry->mtime_ns = mtime->tv_nsec; + ), + + TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen, + __entry->ctime_s, __entry->ctime_ns, + __entry->mtime_s, __entry->mtime_ns + ) +); +#endif /* _TRACE_TIMESTAMP_H */ + +/* This part must be outside protection */ +#include From a777e231666a41cb3bb99ebcd04bee3defef7039 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 12 Jun 2024 14:42:34 -0400 Subject: [PATCH 062/168] fs: add percpu counters for significant multigrain timestamp events Four percpu counters for counting various stats around mgtimes, and a new debugfs file for displaying them, when CONFIG_DEBUG_FS is enabled: - number of attempted ctime updates - number of successful i_ctime_nsec swaps - number of fine-grained timestamp fetches - number of floor value swaps Reviewed-by: Josef Bacik Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/inode.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 78d4f87b22f8..a977f0d06002 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS #include @@ -108,6 +110,77 @@ long get_nr_dirty_inodes(void) return nr_dirty > 0 ? nr_dirty : 0; } +#ifdef CONFIG_DEBUG_FS +static DEFINE_PER_CPU(unsigned long, mg_ctime_updates); +static DEFINE_PER_CPU(unsigned long, mg_fine_stamps); +static DEFINE_PER_CPU(unsigned long, mg_floor_swaps); +static DEFINE_PER_CPU(unsigned long, mg_ctime_swaps); + +static long get_mg_ctime_updates(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(mg_ctime_updates, i); + return sum < 0 ? 0 : sum; +} + +static long get_mg_fine_stamps(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(mg_fine_stamps, i); + return sum < 0 ? 0 : sum; +} + +static long get_mg_floor_swaps(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(mg_floor_swaps, i); + return sum < 0 ? 0 : sum; +} + +static long get_mg_ctime_swaps(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(mg_ctime_swaps, i); + return sum < 0 ? 0 : sum; +} + +#define mgtime_counter_inc(__var) this_cpu_inc(__var) + +static int mgts_show(struct seq_file *s, void *p) +{ + long ctime_updates = get_mg_ctime_updates(); + long ctime_swaps = get_mg_ctime_swaps(); + long fine_stamps = get_mg_fine_stamps(); + long floor_swaps = get_mg_floor_swaps(); + + seq_printf(s, "%lu %lu %lu %lu\n", + ctime_updates, ctime_swaps, fine_stamps, floor_swaps); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(mgts); + +static int __init mg_debugfs_init(void) +{ + debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops); + return 0; +} +late_initcall(mg_debugfs_init); + +#else /* ! CONFIG_DEBUG_FS */ + +#define mgtime_counter_inc(__var) do { } while (0) + +#endif /* CONFIG_DEBUG_FS */ + /* * Handle nr_inode sysctl */ @@ -2675,6 +2748,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) /* Get a fine-grained time */ fine = ktime_get(); + mgtime_counter_inc(mg_fine_stamps); /* * If the cmpxchg works, we take the new floor value. If @@ -2683,11 +2757,14 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) * as good, so keep it. */ old = floor; - if (!atomic64_try_cmpxchg(&ctime_floor, &old, fine)) + if (atomic64_try_cmpxchg(&ctime_floor, &old, fine)) + mgtime_counter_inc(mg_floor_swaps); + else fine = old; now = ktime_mono_to_real(fine); } } + mgtime_counter_inc(mg_ctime_updates); now_ts = timestamp_truncate(ktime_to_timespec64(now), inode); cur = cns; @@ -2702,6 +2779,7 @@ retry: /* If swap occurred, then we're (mostly) done */ inode->i_ctime_sec = now_ts.tv_sec; trace_ctime_ns_xchg(inode, cns, now_ts.tv_nsec, cur); + mgtime_counter_inc(mg_ctime_swaps); } else { /* * Was the change due to someone marking the old ctime QUERIED? From 3a5e76794b88f36d492d6c18058978c15171f941 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Aug 2023 14:28:43 -0400 Subject: [PATCH 063/168] fs: have setattr_copy handle multigrain timestamps appropriately The setattr codepath is still using coarse-grained timestamps, even on multigrain filesystems. To fix this, we need to fetch the timestamp for ctime updates later, at the point where the assignment occurs in setattr_copy. On a multigrain inode, ignore the ia_ctime in the attrs, and always update the ctime to the current clock value. Update the atime and mtime with the same value (if needed) unless they are being set to other specific values, a'la utimes(). Note that we don't want to do this universally however, as some filesystems (e.g. most networked fs) want to do an explicit update elsewhere before updating the local inode. Reviewed-by: Darrick J. Wong Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/attr.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 825007d5cda4..e03ea6951864 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -271,6 +271,42 @@ out_big: } EXPORT_SYMBOL(inode_newsize_ok); +/** + * setattr_copy_mgtime - update timestamps for mgtime inodes + * @inode: inode timestamps to be updated + * @attr: attrs for the update + * + * With multigrain timestamps, we need to take more care to prevent races + * when updating the ctime. Always update the ctime to the very latest + * using the standard mechanism, and use that to populate the atime and + * mtime appropriately (unless we're setting those to specific values). + */ +static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) +{ + unsigned int ia_valid = attr->ia_valid; + struct timespec64 now; + + /* + * If the ctime isn't being updated then nothing else should be + * either. + */ + if (!(ia_valid & ATTR_CTIME)) { + WARN_ON_ONCE(ia_valid & (ATTR_ATIME|ATTR_MTIME)); + return; + } + + now = inode_set_ctime_current(inode); + if (ia_valid & ATTR_ATIME_SET) + inode_set_atime_to_ts(inode, attr->ia_atime); + else if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, now); + + if (ia_valid & ATTR_MTIME_SET) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + else if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, now); +} + /** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from @@ -303,12 +339,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); - if (ia_valid & ATTR_ATIME) - inode_set_atime_to_ts(inode, attr->ia_atime); - if (ia_valid & ATTR_MTIME) - inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) - inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -316,6 +346,16 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, mode &= ~S_ISGID; inode->i_mode = mode; } + + if (is_mgtime(inode)) + return setattr_copy_mgtime(inode, attr); + + if (ia_valid & ATTR_ATIME) + inode_set_atime_to_ts(inode, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); From 42ba4ae65752b8cb8b04430909cb66b28d51c94d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Jun 2024 10:57:26 -0400 Subject: [PATCH 064/168] Documentation: add a new file documenting multigrain timestamps Add a high-level document that describes how multigrain timestamps work, rationale for them, and some info about implementation and tradeoffs. Reviewed-by: Josef Bacik Reviewed-by: Darrick J. Wong Reviewed-by: Randy Dunlap Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- Documentation/filesystems/index.rst | 1 + Documentation/filesystems/multigrain-ts.rst | 121 ++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 Documentation/filesystems/multigrain-ts.rst diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e8e496d23e1d..44e9e77ffe0d 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -29,6 +29,7 @@ algorithms work. fiemap files locks + multigrain-ts mount_api quota seq_file diff --git a/Documentation/filesystems/multigrain-ts.rst b/Documentation/filesystems/multigrain-ts.rst new file mode 100644 index 000000000000..97877ab3d933 --- /dev/null +++ b/Documentation/filesystems/multigrain-ts.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Multigrain Timestamps +===================== + +Introduction +============ +Historically, the kernel has always used coarse time values to stamp inodes. +This value is updated every jiffy, so any change that happens within that jiffy +will end up with the same timestamp. + +When the kernel goes to stamp an inode (due to a read or write), it first gets +the current time and then compares it to the existing timestamp(s) to see +whether anything will change. If nothing changed, then it can avoid updating +the inode's metadata. + +Coarse timestamps are therefore good from a performance standpoint, since they +reduce the need for metadata updates, but bad from the standpoint of +determining whether anything has changed, since a lot of things can happen in a +jiffy. + +They are particularly troublesome with NFSv3, where unchanging timestamps can +make it difficult to tell whether to invalidate caches. NFSv4 provides a +dedicated change attribute that should always show a visible change, but not +all filesystems implement this properly, causing the NFS server to substitute +the ctime in many cases. + +Multigrain timestamps aim to remedy this by selectively using fine-grained +timestamps when a file has had its timestamps queried recently, and the current +coarse-grained time does not cause a change. + +Inode Timestamps +================ +There are currently 3 timestamps in the inode that are updated to the current +wallclock time on different activity: + +ctime: + The inode change time. This is stamped with the current time whenever + the inode's metadata is changed. Note that this value is not settable + from userland. + +mtime: + The inode modification time. This is stamped with the current time + any time a file's contents change. + +atime: + The inode access time. This is stamped whenever an inode's contents are + read. Widely considered to be a terrible mistake. Usually avoided with + options like noatime or relatime. + +Updating the mtime always implies a change to the ctime, but updating the +atime due to a read request does not. + +Multigrain timestamps are only tracked for the ctime and the mtime. atimes are +not affected and always use the coarse-grained value (subject to the floor). + +Inode Timestamp Ordering +======================== + +In addition to just providing info about changes to individual files, file +timestamps also serve an important purpose in applications like "make". These +programs measure timestamps in order to determine whether source files might be +newer than cached objects. + +Userland applications like make can only determine ordering based on +operational boundaries. For a syscall those are the syscall entry and exit +points. For io_uring or nfsd operations, that's the request submission and +response. In the case of concurrent operations, userland can make no +determination about the order in which things will occur. + +For instance, if a single thread modifies one file, and then another file in +sequence, the second file must show an equal or later mtime than the first. The +same is true if two threads are issuing similar operations that do not overlap +in time. + +If however, two threads have racing syscalls that overlap in time, then there +is no such guarantee, and the second file may appear to have been modified +before, after or at the same time as the first, regardless of which one was +submitted first. + +Multigrain Timestamp Implementation +=================================== +Multigrain timestamps are aimed at ensuring that changes to a single file are +always recognizable, without violating the ordering guarantees when multiple +different files are modified. This affects the mtime and the ctime, but the +atime will always use coarse-grained timestamps. + +It uses an unused bit in the i_ctime_nsec field to indicate whether the mtime +or ctime has been queried. If either or both have, then the kernel takes +special care to ensure the next timestamp update will display a visible change. +This ensures tight cache coherency for use-cases like NFS, without sacrificing +the benefits of reduced metadata updates when files aren't being watched. + +The Ctime Floor Value +===================== +It's not sufficient to simply use fine or coarse-grained timestamps based on +whether the mtime or ctime has been queried. A file could get a fine grained +timestamp, and then a second file modified later could get a coarse-grained one +that appears earlier than the first, which would break the kernel's timestamp +ordering guarantees. + +To mitigate this problem, we maintain a global floor value that ensures that +this can't happen. The two files in the above example may appear to have been +modified at the same time in such a case, but they will never show the reverse +order. To avoid problems with realtime clock jumps, the floor is managed as a +monotonic ktime_t, and the values are converted to realtime clock values as +needed. + +Implementation Notes +==================== +Multigrain timestamps are intended for use by local filesystems that get +ctime values from the local clock. This is in contrast to network filesystems +and the like that just mirror timestamp values from a server. + +For most filesystems, it's sufficient to just set the FS_MGTIME flag in the +fstype->fs_flags in order to opt-in, providing the ctime is only ever set via +inode_set_ctime_current(). If the filesystem has a ->getattr routine that +doesn't call generic_fillattr, then you should have it call fill_mg_cmtime to +fill those values. For setattr, it should use setattr_copy() to update the +timestamps, or otherwise mimic its behavior. From 3062a738d73c866bf50df13bc47a2223b7b47d87 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Aug 2023 15:38:42 -0400 Subject: [PATCH 065/168] xfs: switch to multigrain timestamps Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. Also, anytime the mtime changes, the ctime must also change, and those are now the only two options for xfs_trans_ichgtime. Have that function unconditionally bump the ctime, and ASSERT that XFS_ICHGTIME_CHG is always set. Finally, stop setting STATX_CHANGE_COOKIE in getattr, since the ctime should give us better semantics now. Reviewed-by: Josef Bacik Reviewed-by: Darrick J. Wong Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/xfs/libxfs/xfs_trans_inode.c | 6 +++--- fs/xfs/xfs_iops.c | 10 +++------- fs/xfs/xfs_super.c | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 3c40f37e82c7..c962ad64b0c1 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode_set_mtime_to_ts(inode, tv); - if (flags & XFS_ICHGTIME_CHG) - inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_ACCESS) inode_set_atime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1cdc8034f54d..a1c4a350a6db 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -597,8 +597,9 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode_get_atime(inode); - stat->mtime = inode_get_mtime(inode); - stat->ctime = inode_get_ctime(inode); + + fill_mg_cmtime(stat, request_mask, inode); + stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -608,11 +609,6 @@ xfs_vn_getattr( } } - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->change_cookie = inode_query_iversion(inode); - stat->result_mask |= STATX_CHANGE_COOKIE; - } - /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27e9f749c4c7..210481b03fdb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2052,7 +2052,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); From 729f35ab8b0a5641b0d64c320cc77282a89ba2fc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Aug 2023 15:38:43 -0400 Subject: [PATCH 066/168] ext4: switch to multigrain timestamps Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. For ext4, we only need to enable the FS_MGTIME flag. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e72145c4ae5a..a125d9435b8a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7298,7 +7298,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); From c7e408a168b5ae6530685ad3bfbeec1755e83b53 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Aug 2023 15:38:44 -0400 Subject: [PATCH 067/168] btrfs: convert to multigrain timestamps Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. Beyond enabling the FS_MGTIME flag, this patch eliminates update_time_for_write, which goes to great pains to avoid in-memory stores. Just have it overwrite the timestamps unconditionally. Note that this also drops the IS_I_VERSION check and unconditionally bumps the change attribute, since SB_I_VERSION is always set on btrfs. Reviewed-by: Josef Bacik Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/btrfs/file.c | 25 ++++--------------------- fs/btrfs/super.c | 3 ++- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 21381de906f6..493f224b1702 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1120,26 +1120,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now, ts; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - ts = inode_get_mtime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_mtime_to_ts(inode, now); - - ts = inode_get_ctime(inode); - if (!timespec64_equal(&ts, &now)) - inode_set_ctime_to_ts(inode, now); - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { struct file *file = iocb->ki_filp; @@ -1170,7 +1150,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 08d33cb372fb..e6424cf40798 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2194,7 +2194,8 @@ static struct file_system_type btrfs_fs_type = { .init_fs_context = btrfs_init_fs_context, .parameters = btrfs_fs_parameters, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); From 72f170e6e3cf35f2fbfbdc893eb0467c9f8be737 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 7 Aug 2023 15:38:41 -0400 Subject: [PATCH 068/168] tmpfs: add support for multigrain timestamps Enable multigrain timestamps, which should ensure that there is an apparent change to the timestamp whenever it has been written after being actively observed via getattr. tmpfs only requires the FS_MGTIME flag. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2faa9daaf54b..60aeec5ebffb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4808,7 +4808,7 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; void __init shmem_init(void) From 6a7fb6ebe3718ca5f56468b163ff82c65bdf7b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Fri, 26 Apr 2024 18:20:14 +0200 Subject: [PATCH 069/168] fs/xattr: add *at family syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the four syscalls setxattrat(), getxattrat(), listxattrat() and removexattrat(). Those can be used to operate on extended attributes, especially security related ones, either relative to a pinned directory or on a file descriptor without read access, avoiding a /proc//fd/ detour, requiring a mounted procfs. One use case will be setfiles(8) setting SELinux file contexts ("security.selinux") without race conditions and without a file descriptor opened with read access requiring SELinux read permission. Use the do_{name}at() pattern from fs/open.c. Pass the value of the extended attribute, its length, and for setxattrat(2) the command (XATTR_CREATE or XATTR_REPLACE) via an added struct xattr_args to not exceed six syscall arguments and not merging the AT_* and XATTR_* flags. Signed-off-by: Christian Göttsche Link: https://lore.kernel.org/r/20240426162042.191916-1-cgoettsche@seltendoof.de Reviewed-by: Arnd Bergmann CC: x86@kernel.org CC: linux-alpha@vger.kernel.org CC: linux-kernel@vger.kernel.org CC: linux-arm-kernel@lists.infradead.org CC: linux-ia64@vger.kernel.org CC: linux-m68k@lists.linux-m68k.org CC: linux-mips@vger.kernel.org CC: linux-parisc@vger.kernel.org CC: linuxppc-dev@lists.ozlabs.org CC: linux-s390@vger.kernel.org CC: linux-sh@vger.kernel.org CC: sparclinux@vger.kernel.org CC: linux-fsdevel@vger.kernel.org CC: audit@vger.kernel.org CC: linux-arch@vger.kernel.org CC: linux-api@vger.kernel.org CC: linux-security-module@vger.kernel.org CC: selinux@vger.kernel.org [brauner: slight tweaks] Signed-off-by: Christian Brauner --- arch/alpha/kernel/syscalls/syscall.tbl | 4 + arch/arm/tools/syscall.tbl | 4 + arch/m68k/kernel/syscalls/syscall.tbl | 4 + arch/microblaze/kernel/syscalls/syscall.tbl | 4 + arch/mips/kernel/syscalls/syscall_n32.tbl | 4 + arch/mips/kernel/syscalls/syscall_n64.tbl | 4 + arch/mips/kernel/syscalls/syscall_o32.tbl | 4 + arch/parisc/kernel/syscalls/syscall.tbl | 4 + arch/powerpc/kernel/syscalls/syscall.tbl | 4 + arch/s390/kernel/syscalls/syscall.tbl | 4 + arch/sh/kernel/syscalls/syscall.tbl | 4 + arch/sparc/kernel/syscalls/syscall.tbl | 4 + arch/x86/entry/syscalls/syscall_32.tbl | 4 + arch/x86/entry/syscalls/syscall_64.tbl | 4 + arch/xtensa/kernel/syscalls/syscall.tbl | 4 + fs/xattr.c | 153 ++++++++++++++++---- include/asm-generic/audit_change_attr.h | 6 + include/linux/syscalls.h | 13 ++ include/linux/xattr.h | 4 + include/uapi/asm-generic/unistd.h | 11 +- include/uapi/linux/xattr.h | 7 + 21 files changed, 225 insertions(+), 29 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 74720667fe09..c59d53d6d3f3 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -502,3 +502,7 @@ 570 common lsm_set_self_attr sys_lsm_set_self_attr 571 common lsm_list_modules sys_lsm_list_modules 572 common mseal sys_mseal +573 common setxattrat sys_setxattrat +574 common getxattrat sys_getxattrat +575 common listxattrat sys_listxattrat +576 common removexattrat sys_removexattrat diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 23c98203c40f..49eeb2ad8dbd 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -477,3 +477,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 22a3cbd4c602..f5ed71f1910d 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -462,3 +462,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2b81a6bd78b2..680f568b77f2 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -468,3 +468,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 953f5b7dc723..0b9b7e25b69a 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -401,3 +401,7 @@ 460 n32 lsm_set_self_attr sys_lsm_set_self_attr 461 n32 lsm_list_modules sys_lsm_list_modules 462 n32 mseal sys_mseal +463 n32 setxattrat sys_setxattrat +464 n32 getxattrat sys_getxattrat +465 n32 listxattrat sys_listxattrat +466 n32 removexattrat sys_removexattrat diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 1464c6be6eb3..c844cd5cda62 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -377,3 +377,7 @@ 460 n64 lsm_set_self_attr sys_lsm_set_self_attr 461 n64 lsm_list_modules sys_lsm_list_modules 462 n64 mseal sys_mseal +463 n64 setxattrat sys_setxattrat +464 n64 getxattrat sys_getxattrat +465 n64 listxattrat sys_listxattrat +466 n64 removexattrat sys_removexattrat diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 2439a2491cff..349b8aad1159 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -450,3 +450,7 @@ 460 o32 lsm_set_self_attr sys_lsm_set_self_attr 461 o32 lsm_list_modules sys_lsm_list_modules 462 o32 mseal sys_mseal +463 o32 setxattrat sys_setxattrat +464 o32 getxattrat sys_getxattrat +465 o32 listxattrat sys_listxattrat +466 o32 removexattrat sys_removexattrat diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 66dc406b12e4..d9fc94c86965 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -461,3 +461,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index ebae8415dfbb..d8b4ab78bef0 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -553,3 +553,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 01071182763e..e9115b4d8b63 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -465,3 +465,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal sys_mseal +463 common setxattrat sys_setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat sys_removexattrat diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index c55fd7696d40..c8cad33bf250 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -466,3 +466,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index cfdfb3707c16..727f99d333b3 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -508,3 +508,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 534c74b14fab..4d0fb2fba7e2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -468,3 +468,7 @@ 460 i386 lsm_set_self_attr sys_lsm_set_self_attr 461 i386 lsm_list_modules sys_lsm_list_modules 462 i386 mseal sys_mseal +463 i386 setxattrat sys_setxattrat +464 i386 getxattrat sys_getxattrat +465 i386 listxattrat sys_listxattrat +466 i386 removexattrat sys_removexattrat diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 7093ee21c0d1..5eb708bff1c7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -386,6 +386,10 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 67083fc1b2f5..37effc1b134e 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -433,3 +433,7 @@ 460 common lsm_set_self_attr sys_lsm_set_self_attr 461 common lsm_list_modules sys_lsm_list_modules 462 common mseal sys_mseal +463 common setxattrat sys_setxattrat +464 common getxattrat sys_getxattrat +465 common listxattrat sys_listxattrat +466 common removexattrat sys_removexattrat diff --git a/fs/xattr.c b/fs/xattr.c index a90215c97c9f..a95e8aecb7ab 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -630,27 +630,35 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, ctx->kvalue, ctx->size, ctx->flags); } -static int path_setxattr(const char __user *pathname, - const char __user *name, const void __user *value, - size_t size, int flags, unsigned int lookup_flags) +static int path_setxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, }; struct path path; + unsigned int lookup_flags = 0; int error; + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (at_flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = setxattr_copy(name, &ctx); if (error) return error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -669,25 +677,50 @@ out: return error; } +SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, const struct xattr_args __user *, uargs, + size_t, usize) +{ + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return -EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (error) + return error; + + return path_setxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size, + args.flags); +} + SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); + return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - return path_setxattr(pathname, name, value, size, flags, 0); + return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { struct xattr_name kname; - struct xattr_ctx ctx = { + struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, @@ -775,14 +808,22 @@ getxattr(struct mnt_idmap *idmap, struct dentry *d, return error; } -static ssize_t path_getxattr(const char __user *pathname, - const char __user *name, void __user *value, - size_t size, unsigned int lookup_flags) +static ssize_t path_getxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name, + void __user *value, size_t size) { struct path path; ssize_t error; + int lookup_flags; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (at_flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) return error; error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size); @@ -794,16 +835,42 @@ retry: return error; } +SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, + const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) +{ + struct xattr_args args = {}; + int error; + + BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); + + if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) + return -EINVAL; + if (usize > PAGE_SIZE) + return -E2BIG; + + error = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (error) + return error; + + if (args.flags != 0) + return -EINVAL; + + return path_getxattrat(dfd, pathname, at_flags, name, + u64_to_user_ptr(args.value), args.size); +} + SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); + return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - return path_getxattr(pathname, name, value, size, 0); + return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, + value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, @@ -853,13 +920,22 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -static ssize_t path_listxattr(const char __user *pathname, char __user *list, - size_t size, unsigned int lookup_flags) +static ssize_t path_listxattrat(int dfd, const char __user *pathname, + unsigned int at_flags, char __user *list, + size_t size) { struct path path; ssize_t error; + int lookup_flags; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (at_flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); @@ -871,16 +947,23 @@ retry: return error; } +SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, + char __user *, list, size_t, size) +{ + return path_listxattrat(dfd, pathname, at_flags, list, size); +} + SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); + return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { - return path_listxattr(pathname, list, size, 0); + return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) @@ -907,12 +990,20 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) return vfs_removexattr(idmap, d, name); } -static int path_removexattr(const char __user *pathname, - const char __user *name, unsigned int lookup_flags) +static int path_removexattrat(int dfd, const char __user *pathname, + unsigned int at_flags, const char __user *name) { struct path path; int error; char kname[XATTR_NAME_MAX + 1]; + unsigned int lookup_flags = 0; + + if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (at_flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) @@ -920,7 +1011,7 @@ static int path_removexattr(const char __user *pathname, if (error < 0) return error; retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); + error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); @@ -936,16 +1027,22 @@ retry: return error; } +SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, + unsigned int, at_flags, const char __user *, name) +{ + return path_removexattrat(dfd, pathname, at_flags, name); +} + SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, LOOKUP_FOLLOW); + return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { - return path_removexattr(pathname, name, 0); + return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) diff --git a/include/asm-generic/audit_change_attr.h b/include/asm-generic/audit_change_attr.h index 331670807cf0..cc840537885f 100644 --- a/include/asm-generic/audit_change_attr.h +++ b/include/asm-generic/audit_change_attr.h @@ -11,9 +11,15 @@ __NR_lchown, __NR_fchown, #endif __NR_setxattr, +#ifdef __NR_setxattrat +__NR_setxattrat, +#endif __NR_lsetxattr, __NR_fsetxattr, __NR_removexattr, +#ifdef __NR_removexattrat +__NR_removexattrat, +#endif __NR_lremovexattr, __NR_fremovexattr, #ifdef __NR_fchownat diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4bcf6754738d..548c24269d47 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct xattr_args; #include #include @@ -338,23 +339,35 @@ asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, void __user *arg, unsigned int nr_args); asmlinkage long sys_setxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); +asmlinkage long sys_setxattrat(int dfd, const char __user *path, unsigned int at_flags, + const char __user *name, + const struct xattr_args __user *args, size_t size); asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags); asmlinkage long sys_getxattr(const char __user *path, const char __user *name, void __user *value, size_t size); +asmlinkage long sys_getxattrat(int dfd, const char __user *path, unsigned int at_flags, + const char __user *name, + struct xattr_args __user *args, size_t size); asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, size_t size); asmlinkage long sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size); asmlinkage long sys_listxattr(const char __user *path, char __user *list, size_t size); +asmlinkage long sys_listxattrat(int dfd, const char __user *path, + unsigned int at_flags, + char __user *list, size_t size); asmlinkage long sys_llistxattr(const char __user *path, char __user *list, size_t size); asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size); asmlinkage long sys_removexattr(const char __user *path, const char __user *name); +asmlinkage long sys_removexattrat(int dfd, const char __user *path, + unsigned int at_flags, + const char __user *name); asmlinkage long sys_lremovexattr(const char __user *path, const char __user *name); asmlinkage long sys_fremovexattr(int fd, const char __user *name); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d20051865800..86b0d47984a1 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -19,6 +19,10 @@ #include #include +/* List of all open_how "versions". */ +#define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ +#define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 + struct inode; struct dentry; diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 5bf6148cac2b..88dc393c2bca 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -841,8 +841,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) #define __NR_mseal 462 __SYSCALL(__NR_mseal, sys_mseal) +#define __NR_setxattrat 463 +__SYSCALL(__NR_setxattrat, sys_setxattrat) +#define __NR_getxattrat 464 +__SYSCALL(__NR_getxattrat, sys_getxattrat) +#define __NR_listxattrat 465 +__SYSCALL(__NR_listxattrat, sys_listxattrat) +#define __NR_removexattrat 466 +__SYSCALL(__NR_removexattrat, sys_removexattrat) + #undef __NR_syscalls -#define __NR_syscalls 463 +#define __NR_syscalls 467 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 9463db2dfa9d..9854f9cff3c6 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -11,6 +11,7 @@ */ #include +#include #ifndef _UAPI_LINUX_XATTR_H #define _UAPI_LINUX_XATTR_H @@ -20,6 +21,12 @@ #define XATTR_CREATE 0x1 /* set value, fail if attr already exists */ #define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */ + +struct xattr_args { + __aligned_u64 __user value; + __u32 size; + __u32 flags; +}; #endif /* Namespaces */ From 1a61c9d6ec1d4a485c025904acff19f93765f8b6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 7 Aug 2024 10:31:42 +0200 Subject: [PATCH 070/168] xattr: handle AT_EMPTY_PATH when setting xattrs Don't allow O_PATH file descriptors to be used to set xattrs. Signed-off-by: Christian Brauner --- fs/xattr.c | 63 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index a95e8aecb7ab..abd4cb124759 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -630,6 +630,38 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, ctx->kvalue, ctx->size, ctx->flags); } +static int do_fsetxattr(int fd, const char __user *name, + const void __user *value, size_t size, int flags) +{ + struct xattr_name kname; + struct kernel_xattr_ctx ctx = { + .cvalue = value, + .kvalue = NULL, + .size = size, + .kname = &kname, + .flags = flags, + }; + int error; + + CLASS(fd, f)(fd); + if (!f.file) + return -EBADF; + + audit_file(f.file); + error = setxattr_copy(name, &ctx); + if (error) + return error; + + error = mnt_want_write_file(f.file); + if (!error) { + error = do_setxattr(file_mnt_idmap(f.file), + f.file->f_path.dentry, &ctx); + mnt_drop_write_file(f.file); + } + kvfree(ctx.kvalue); + return error; +} + static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) @@ -649,6 +681,9 @@ static int path_setxattrat(int dfd, const char __user *pathname, if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + if (at_flags & AT_EMPTY_PATH && vfs_empty_path(dfd, pathname)) + return do_fsetxattr(dfd, name, value, size, flags); + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (at_flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; @@ -719,33 +754,7 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { - struct xattr_name kname; - struct kernel_xattr_ctx ctx = { - .cvalue = value, - .kvalue = NULL, - .size = size, - .kname = &kname, - .flags = flags, - }; - int error; - - CLASS(fd, f)(fd); - if (!f.file) - return -EBADF; - - audit_file(f.file); - error = setxattr_copy(name, &ctx); - if (error) - return error; - - error = mnt_want_write_file(f.file); - if (!error) { - error = do_setxattr(file_mnt_idmap(f.file), - f.file->f_path.dentry, &ctx); - mnt_drop_write_file(f.file); - } - kvfree(ctx.kvalue); - return error; + return do_fsetxattr(fd, name, value, size, flags); } /* From 278397b2c592b08f207d6d91bd1c60b14823fdf6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 7 Aug 2024 10:33:16 +0200 Subject: [PATCH 071/168] xattr: handle AT_EMPTY_PATH when getting xattrs Don't allow O_PATH file descriptors to be get xattrs. Signed-off-by: Christian Brauner --- fs/xattr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xattr.c b/fs/xattr.c index abd4cb124759..54fee383b04f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -828,9 +829,19 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname, if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + if (at_flags & AT_EMPTY_PATH && vfs_empty_path(dfd, pathname)) { + CLASS(fd, f)(dfd); + if (!f.file) + return -EBADF; + audit_file(f.file); + return getxattr(file_mnt_idmap(f.file), file_dentry(f.file), + name, value, size); + } + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (at_flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) From 5560ab7ee32eff54ce278cb7905471f90c05abe5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 7 Aug 2024 10:33:49 +0200 Subject: [PATCH 072/168] xattr: handle AT_EMPTY_PATH when listing xattrs Don't allow O_PATH file descriptors to be used to list xattrs. Signed-off-by: Christian Brauner --- fs/xattr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xattr.c b/fs/xattr.c index 54fee383b04f..e053090366f6 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -951,6 +951,14 @@ static ssize_t path_listxattrat(int dfd, const char __user *pathname, if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + if (at_flags & AT_EMPTY_PATH && vfs_empty_path(dfd, pathname)) { + CLASS(fd, f)(dfd); + if (!f.file) + return -EBADF; + audit_file(f.file); + return listxattr(file_dentry(f.file), list, size); + } + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (at_flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; From 33fce6444e7d8f98e896bf3843ead87ce142e1f9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 7 Aug 2024 10:35:46 +0200 Subject: [PATCH 073/168] xattr: handle AT_EMPTY_PATH when removing xattrs Don't allow O_PATH file descriptors to be used to remove xattrs. Signed-off-by: Christian Brauner --- fs/xattr.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/xattr.c b/fs/xattr.c index e053090366f6..0bbd5bdf3c7d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1018,6 +1018,31 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) return vfs_removexattr(idmap, d, name); } +static int do_fremovexattr(int fd, const char __user *name) +{ + char kname[XATTR_NAME_MAX + 1]; + int error = -EBADF; + + CLASS(fd, f)(fd); + if (!f.file) + return error; + audit_file(f.file); + + error = strncpy_from_user(kname, name, sizeof(kname)); + if (error == 0 || error == sizeof(kname)) + error = -ERANGE; + if (error < 0) + return error; + + error = mnt_want_write_file(f.file); + if (!error) { + error = removexattr(file_mnt_idmap(f.file), + f.file->f_path.dentry, kname); + mnt_drop_write_file(f.file); + } + return error; +} + static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { @@ -1029,6 +1054,9 @@ static int path_removexattrat(int dfd, const char __user *pathname, if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + if (at_flags & AT_EMPTY_PATH && vfs_empty_path(dfd, pathname)) + return do_fremovexattr(dfd, name); + lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; if (at_flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; From 346574e1e6d94fbc34caf8fa4c657330b9ed780a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Aug 2024 13:09:28 +0200 Subject: [PATCH 074/168] xattr: don't raise LOOKUP_EMPTY Now that we handle AT_EMPTY_PATH for *xattrat() directly without going any extra lookup work we know that if we didn't take the fastpath that the provided path isn't empty and so we don't need to raise LOOKUP_EMPTY. This also guards against the case where through some race the provided user path somehow became empty after the check (e.g., racing thread messing with /proc//mem of the process performing the xattr call). If that were to happen and LOOKUP_EMPTY is raised and the provided fd is an O_PATH fd we would accidently allow reading, listing, writing, and removing xattrs through O_PATH file descriptors. We don't want to do that since no read-write like operations should be allowed with O_PATH file descriptors. Signed-off-by: Christian Brauner --- fs/xattr.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 0bbd5bdf3c7d..70ab6e28cc17 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -686,8 +686,6 @@ static int path_setxattrat(int dfd, const char __user *pathname, return do_fsetxattr(dfd, name, value, size, flags); lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - if (at_flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; error = setxattr_copy(name, &ctx); if (error) @@ -839,8 +837,6 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname, } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - if (at_flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, pathname, lookup_flags, &path); @@ -960,8 +956,6 @@ static ssize_t path_listxattrat(int dfd, const char __user *pathname, } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - if (at_flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, pathname, lookup_flags, &path); if (error) @@ -1058,8 +1052,6 @@ static int path_removexattrat(int dfd, const char __user *pathname, return do_fremovexattr(dfd, name); lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - if (at_flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) From cc3e8969ffb2ce2560d0206b688a59bb4bf0fcc3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:18 +0200 Subject: [PATCH 075/168] fs/namespace: introduce SB_I_NOIDMAP flag Right now we determine if filesystem support vfs idmappings or not basing on the FS_ALLOW_IDMAP flag presence. This "static" way works perfecly well for local filesystems like ext4, xfs, btrfs, etc. But for network-like filesystems like fuse, cephfs this approach is not ideal, because sometimes proper support of vfs idmaps requires some extensions for the on-wire protocol, which implies that changes have to be made not only in the Linux kernel code but also in the 3rd party components like libfuse, cephfs MDS server and so on. We have seen that issue during our work on cephfs idmapped mounts [1] with Christian, but right now I'm working on the idmapped mounts support for fuse/virtiofs and I think that it is a right time for this extension. [1] 5ccd8530dd7 ("ceph: handle idmapped mounts in create_request_message()") Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-2-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++++ include/linux/fs.h | 1 + 2 files changed, 5 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 328087a4df8a..d1702285c915 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4436,6 +4436,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; + /* The filesystem has turned off idmapped mounts. */ + if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) + return -EINVAL; + /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..6ff547ef21f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1189,6 +1189,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ /* Possible states of 'frozen' field */ enum { From d2c5937035e563bf1bf5c1a8b0ef7a1958f0028e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:19 +0200 Subject: [PATCH 076/168] fs/fuse: add FUSE_OWNER_UID_GID_EXT extension To properly support vfs idmappings we need to provide a fuse daemon with the correct owner uid/gid for inode creation requests like mkdir, mknod, atomic_open, symlink. Right now, fuse daemons use req->in.h.uid/req->in.h.gid to set inode owner. These fields contain fsuid/fsgid of the syscall's caller. And that's perfectly fine, because inode owner have to be set to these values. But, for idmapped mounts it's not the case and caller fsuid/fsgid != inode owner, because idmapped mounts do nothing with the caller fsuid/fsgid, but affect inode owner uid/gid. It means that we can't apply vfsid mapping to caller fsuid/fsgid, but instead we have to introduce a new fields to store inode owner uid/gid which will be appropriately transformed. Christian and I have done the same to support idmapped mounts in the cephfs recently [1]. [1] 5ccd8530 ("ceph: handle idmapped mounts in create_request_message()") Cc: Miklos Szeredi Cc: Christian Brauner Cc: Seth Forshee Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-3-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 34 +++++++++++++++++++++++++++++++--- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 4 +++- include/uapi/linux/fuse.h | 19 +++++++++++++++++++ 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2b0d4781f394..30d27d4f3b5a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -572,7 +572,33 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) return 0; } -static int get_create_ext(struct fuse_args *args, +static int get_owner_uid_gid(struct mnt_idmap *idmap, struct fuse_conn *fc, struct fuse_in_arg *ext) +{ + struct fuse_ext_header *xh; + struct fuse_owner_uid_gid *owner_creds; + u32 owner_creds_len = fuse_ext_size(sizeof(*owner_creds)); + kuid_t owner_fsuid; + kgid_t owner_fsgid; + + xh = extend_arg(ext, owner_creds_len); + if (!xh) + return -ENOMEM; + + xh->size = owner_creds_len; + xh->type = FUSE_EXT_OWNER_UID_GID; + + owner_creds = (struct fuse_owner_uid_gid *) &xh[1]; + + owner_fsuid = mapped_fsuid(idmap, fc->user_ns); + owner_fsgid = mapped_fsgid(idmap, fc->user_ns); + owner_creds->uid = from_kuid(fc->user_ns, owner_fsuid); + owner_creds->gid = from_kgid(fc->user_ns, owner_fsgid); + + return 0; +} + +static int get_create_ext(struct mnt_idmap *idmap, + struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -584,6 +610,8 @@ static int get_create_ext(struct fuse_args *args, err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) err = get_create_supp_group(dir, &ext); + if (!err && fc->owner_uid_gid_ext) + err = get_owner_uid_gid(idmap, fc, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); @@ -668,7 +696,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; - err = get_create_ext(&args, dir, entry, mode); + err = get_create_ext(&nop_mnt_idmap, &args, dir, entry, mode); if (err) goto out_put_forget_req; @@ -798,7 +826,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(args, dir, entry, mode); + err = get_create_ext(&nop_mnt_idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f23919610313..d06934e70cc5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -845,6 +845,9 @@ struct fuse_conn { /* Add supplementary group info when creating a new inode */ unsigned int create_supp_group:1; + /* Add owner_{u,g}id info when creating a new inode */ + unsigned int owner_uid_gid_ext:1; + /* Does the filesystem support per inode DAX? */ unsigned int inode_dax:1; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d8ab4e93916f..6c205731c844 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1343,6 +1343,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_OWNER_UID_GID_EXT) + fc->owner_uid_gid_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1390,7 +1392,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_OWNER_UID_GID_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d08b99d60f6f..d9ecc17fd13b 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -217,6 +217,10 @@ * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag * - add FUSE_NO_EXPORT_SUPPORT init flag * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag + * + * 7.41 + * - add FUSE_EXT_OWNER_UID_GID + * - add FUSE_OWNER_UID_GID_EXT */ #ifndef _LINUX_FUSE_H @@ -421,6 +425,8 @@ struct fuse_file_lock { * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit * of the request ID indicates resend requests + * FUSE_OWNER_UID_GID_EXT: add inode owner UID/GID info to create, mkdir, + * symlink and mknod */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -466,6 +472,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP +#define FUSE_OWNER_UID_GID_EXT (1ULL << 40) /** * CUSE INIT request/reply flags @@ -575,11 +582,13 @@ struct fuse_file_lock { * extension type * FUSE_MAX_NR_SECCTX: maximum value of &fuse_secctx_header.nr_secctx * FUSE_EXT_GROUPS: &fuse_supp_groups extension + * FUSE_EXT_OWNER_UID_GID: &fuse_owner_uid_gid extension */ enum fuse_ext_type { /* Types 0..31 are reserved for fuse_secctx_header */ FUSE_MAX_NR_SECCTX = 31, FUSE_EXT_GROUPS = 32, + FUSE_EXT_OWNER_UID_GID = 33, }; enum fuse_opcode { @@ -1186,4 +1195,14 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * struct fuse_owner_uid_gid - Inode owner UID/GID extension + * @uid: inode owner UID + * @gid: inode owner GID + */ +struct fuse_owner_uid_gid { + uint32_t uid; + uint32_t gid; +}; + #endif /* _LINUX_FUSE_H */ From 9961d396252bd9e45ee883654626ad4e5d1953c5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:20 +0200 Subject: [PATCH 077/168] fs/fuse: support idmap for mkdir/mknod/symlink/create We have all the infrastructure in place, we just need to pass an idmapping here. Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-4-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 30d27d4f3b5a..1e45c6157af4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -637,9 +637,9 @@ static void free_ext_value(struct fuse_args *args) * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned int flags, - umode_t mode, u32 opcode) +static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, struct file *file, + unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -696,7 +696,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; - err = get_create_ext(&nop_mnt_idmap, &args, dir, entry, mode); + err = get_create_ext(idmap, &args, dir, entry, mode); if (err) goto out_put_forget_req; @@ -757,6 +757,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, umode_t mode) { int err; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; @@ -781,7 +782,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); + err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -792,7 +793,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -802,9 +803,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, - struct inode *dir, struct dentry *entry, - umode_t mode) +static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -826,7 +827,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(&nop_mnt_idmap, args, dir, entry, mode); + err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } @@ -892,13 +893,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, mode); + return create_new_entry(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, @@ -910,7 +911,7 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (fc->no_tmpfile) return -EOPNOTSUPP; - err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + err = fuse_create_open(idmap, dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; @@ -937,7 +938,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, S_IFDIR); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, @@ -953,7 +954,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fm, &args, dir, entry, S_IFLNK); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1147,7 +1148,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + err = create_new_entry(&nop_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) From 52dfd148ff75e75842e233135e128eb3fdc33cb2 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:21 +0200 Subject: [PATCH 078/168] fs/fuse: support idmapped getattr inode op We have to: - pass an idmapping to the generic_fillattr() to properly handle UIG/GID mapping for the userspace. - pass -/- to fuse_fillattr() (analog of generic_fillattr() in fuse). Difference between these two is that generic_fillattr() takes all the stat() data from the inode directly, while fuse_fillattr() codepath takes a fresh data just from the userspace reply on the FUSE_GETATTR request. In some cases we can just pass &nop_mnt_idmap, because idmapping won't be used in these codepaths. For example, when 3rd argument of fuse_do_getattr() is NULL then idmap argument is not used. Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-5-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1e45c6157af4..a5bf8c18a0ae 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1157,18 +1157,22 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, - struct kstat *stat) +static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, + struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); + vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, + make_kuid(fc->user_ns, attr->uid)); + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, + make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(fc->user_ns, attr->uid); - stat->gid = make_kgid(fc->user_ns, attr->gid); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1207,8 +1211,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) attr->blksize = sx->blksize; } -static int fuse_do_statx(struct inode *inode, struct file *file, - struct kstat *stat) +static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; @@ -1261,15 +1265,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file, stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - fuse_fillattr(inode, &attr, stat); + fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } -static int fuse_do_getattr(struct inode *inode, struct kstat *stat, - struct file *file) +static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, + struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; @@ -1308,15 +1312,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, ATTR_TIMEOUT(&outarg), attr_version); if (stat) - fuse_fillattr(inode, &outarg.attr, stat); + fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } -static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, u32 request_mask, - unsigned int flags) +static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1347,17 +1351,17 @@ retry: forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { - err = fuse_do_statx(inode, file, stat); + err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; err = 0; goto retry; } } else { - err = fuse_do_getattr(inode, stat, file); + err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { @@ -1371,7 +1375,7 @@ retry: int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - return fuse_update_get_attr(inode, file, NULL, mask, 0); + return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1515,7 +1519,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) return -ECHILD; forget_all_cached_acls(inode); - return fuse_do_getattr(inode, NULL, NULL); + return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* @@ -2094,7 +2098,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ - ret = fuse_do_getattr(inode, NULL, file); + ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; @@ -2151,7 +2155,7 @@ static int fuse_getattr(struct mnt_idmap *idmap, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { From 34ddf0de71be7aa332af42bbfe0cd2a144c61bee Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:22 +0200 Subject: [PATCH 079/168] fs/fuse: support idmapped ->permission inode op We only cover the case when "default_permissions" flag is used. A reason for that is that otherwise all the permission checks are done in the userspace and we have to deal with VFS idmapping in the userspace (which is bad), alternatively we have to provide the userspace with idmapped req->in.h.uid/req->in.h.gid which is also not align with VFS idmaps philosophy. Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-6-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index a5bf8c18a0ae..cd3b91b60cae 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1567,7 +1567,7 @@ static int fuse_permission(struct mnt_idmap *idmap, } if (fc->default_permissions) { - err = generic_permission(&nop_mnt_idmap, inode, mask); + err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1575,7 +1575,7 @@ static int fuse_permission(struct mnt_idmap *idmap, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&nop_mnt_idmap, + err = generic_permission(idmap, inode, mask); } From 27b622529cdcbbcb37da189c20e8288741dc31ea Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:23 +0200 Subject: [PATCH 080/168] fs/fuse: support idmapped ->setattr op Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-7-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 32 +++++++++++++++++++++----------- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 4 ++-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index cd3b91b60cae..c50f951596dd 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1771,17 +1771,27 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) +static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, + struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + + if (ivalid & ATTR_UID) { + kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); + arg->valid |= FATTR_UID; + arg->uid = from_kuid(fc->user_ns, fsuid); + } + + if (ivalid & ATTR_GID) { + kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); + arg->valid |= FATTR_GID; + arg->gid = from_kgid(fc->user_ns, fsgid); + } + if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1901,8 +1911,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); @@ -1922,7 +1932,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -1981,7 +1991,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); + iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -2116,7 +2126,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, if (!attr->ia_valid) return 0; - ret = fuse_do_setattr(entry, attr, file); + ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f39456c65ed7..562bdf8d5976 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2966,7 +2966,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d06934e70cc5..883151a44d72 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1333,8 +1333,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file); +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); From 6d8f2f4fde1377605b0f1521e81ec31c135fe6c2 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:24 +0200 Subject: [PATCH 081/168] fs/fuse: drop idmap argument from __fuse_get_acl We don't need to have idmap in the __fuse_get_acl as we don't have any use for it. In the current POSIX ACL implementation, idmapped mounts are taken into account on the userspace/kernel border (see vfs_set_acl_idmapped_mnt() and vfs_posix_acl_to_xattr()). Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-8-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 04cfd8fee992..897d813c5e92 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -12,7 +12,6 @@ #include static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, - struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu) { int size; @@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); - return __fuse_get_acl(fc, idmap, inode, type, false); + return __fuse_get_acl(fc, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) @@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) */ if (!fc->posix_acl) return NULL; - - return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); + return __fuse_get_acl(fc, inode, type, rcu); } int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, From ab7c30987cbb50914ea9774fbe3713225748eed5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:25 +0200 Subject: [PATCH 082/168] fs/fuse: support idmapped ->set_acl It's just a matter of adjusting a permission check condition for S_ISGID flag. All the rest is already handled in the generic VFS code. Notice that this permission check is the analog of what we have in posix_acl_update_mode() generic helper, but fuse doesn't use this helper as on the kernel side we don't care about ensuring that POSIX ACL and CHMOD permissions are in sync as it is a responsibility of a userspace daemon to handle that. For the same reason we don't have a calls to posix_acl_chmod(), while most of other filesystem do. Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-9-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 897d813c5e92..8f484b105f13 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -144,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. */ if (fc->posix_acl && - !in_group_or_capable(&nop_mnt_idmap, inode, - i_gid_into_vfsgid(&nop_mnt_idmap, inode))) + !in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); From 76c0baad3782c20060eac12b630230cefd412fdb Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:26 +0200 Subject: [PATCH 083/168] fs/fuse: properly handle idmapped ->rename op Support of RENAME_WHITEOUT with idmapped mounts requires an API extension for FUSE_RENAME2. Let's just forbid this combination for now. It's not critical at all as it's only needed for overlayfs on top of fuse/virtiofs. Choice of EINVAL is not random, we just simulate a standard behavior when RENAME_WHITEOUT flag is not supported. Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-10-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c50f951596dd..0cd01f25251f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1111,6 +1111,9 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && (idmap != &nop_mnt_idmap)) + return -EINVAL; + if (flags) { if (fc->no_rename2 || fc->minor < 23) return -EINVAL; From 9aace2eda1bd4b6213353b01a2728633b153a18a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:27 +0200 Subject: [PATCH 084/168] fs/fuse: allow idmapped mounts Now we have everything in place and we can allow idmapped mounts by setting the FS_ALLOW_IDMAP flag. Notice that real availability of idmapped mounts will depend on the fuse daemon. Fuse daemon have to set FUSE_ALLOW_IDMAP flag in the FUSE_INIT reply. To discuss: - we enable idmapped mounts support only if "default_permissions" mode is enabled, because otherwise we would need to deal with UID/GID mappings in the userspace side OR provide the userspace with idmapped req->in.h.uid/req->in.h.gid values which is not something that we probably want to. Idmapped mounts phylosophy is not about faking caller uid/gid. Some extra links and examples: - libfuse support https://github.com/mihalicyn/libfuse/commits/idmap_support - fuse-overlayfs support: https://github.com/mihalicyn/fuse-overlayfs/commits/idmap_support - cephfs-fuse conversion example https://github.com/mihalicyn/ceph/commits/fuse_idmap - glusterfs conversion example https://github.com/mihalicyn/glusterfs/commits/fuse_idmap Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-11-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/inode.c | 14 +++++++++++--- include/uapi/linux/fuse.h | 5 ++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6c205731c844..b840189ac8be 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1345,6 +1345,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fm->sb->s_export_op = &fuse_export_fid_operations; if (flags & FUSE_OWNER_UID_GID_EXT) fc->owner_uid_gid_ext = 1; + if (flags & FUSE_ALLOW_IDMAP) { + if (fc->owner_uid_gid_ext && fc->default_permissions) + fm->sb->s_iflags &= ~SB_I_NOIDMAP; + else + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1392,7 +1398,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_OWNER_UID_GID_EXT; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_OWNER_UID_GID_EXT | + FUSE_ALLOW_IDMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1569,6 +1576,7 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); @@ -1981,7 +1989,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, @@ -2002,7 +2010,7 @@ static struct file_system_type fuseblk_fs_type = { .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d9ecc17fd13b..b23e8247ce43 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -221,6 +221,7 @@ * 7.41 * - add FUSE_EXT_OWNER_UID_GID * - add FUSE_OWNER_UID_GID_EXT + * - add FUSE_ALLOW_IDMAP */ #ifndef _LINUX_FUSE_H @@ -256,7 +257,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 40 +#define FUSE_KERNEL_MINOR_VERSION 41 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -427,6 +428,7 @@ struct fuse_file_lock { * of the request ID indicates resend requests * FUSE_OWNER_UID_GID_EXT: add inode owner UID/GID info to create, mkdir, * symlink and mknod + * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -473,6 +475,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_OWNER_UID_GID_EXT (1ULL << 40) +#define FUSE_ALLOW_IDMAP (1ULL << 41) /** * CUSE INIT request/reply flags From 020a698f136c609c19d0ace16516c6e1066fa06b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 15 Aug 2024 11:24:28 +0200 Subject: [PATCH 085/168] fs/fuse/virtio_fs: allow idmapped mounts Allow idmapped mounts for virtiofs. It's absolutely safe as for virtiofs we have the same feature negotiation mechanism as for classical fuse filesystems. This does not affect any existing setups anyhow. virtiofsd support: https://gitlab.com/virtio-fs/virtiofsd/-/merge_requests/245 Cc: Christian Brauner Cc: Seth Forshee Cc: Miklos Szeredi Cc: Vivek Goyal Cc: German Maglione Cc: Amir Goldstein Cc: Bernd Schubert Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240815092429.103356-12-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- fs/fuse/virtio_fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index dd5260141615..7e5bbaef6f76 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1628,6 +1628,7 @@ static struct file_system_type virtio_fs_type = { .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb = virtio_kill_sb, + .fs_flags = FS_ALLOW_IDMAP, }; static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) From bcc7d11e6c0969208ee056c39692138a3f56d828 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 Aug 2024 20:59:12 +0100 Subject: [PATCH 086/168] jffs2: Convert jffs2_do_readpage_nolock to take a folio Both callers now have a folio, so pass it in. No effort is made here to support large folios. Removes several hidden calls to compound_head(), two references to page->index and a use of kmap. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240814195915.249871-2-willy@infradead.org Reviewed-by: Zhihao Cheng Signed-off-by: Christian Brauner --- fs/jffs2/file.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index ada572c466f8..13c18ccc13b0 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -77,29 +77,27 @@ const struct address_space_operations jffs2_file_address_operations = .write_end = jffs2_write_end, }; -static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) +static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio) { struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - unsigned char *pg_buf; + unsigned char *kaddr; int ret; jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", - __func__, inode->i_ino, pg->index << PAGE_SHIFT); + __func__, inode->i_ino, folio->index << PAGE_SHIFT); - BUG_ON(!PageLocked(pg)); + BUG_ON(!folio_test_locked(folio)); - pg_buf = kmap(pg); - /* FIXME: Can kmap fail? */ - - ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT, + kaddr = kmap_local_folio(folio, 0); + ret = jffs2_read_inode_range(c, f, kaddr, folio->index << PAGE_SHIFT, PAGE_SIZE); + kunmap_local(kaddr); if (!ret) - SetPageUptodate(pg); + folio_mark_uptodate(folio); - flush_dcache_page(pg); - kunmap(pg); + flush_dcache_folio(folio); jffs2_dbg(2, "readpage finished\n"); return ret; @@ -107,7 +105,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) int __jffs2_read_folio(struct file *file, struct folio *folio) { - int ret = jffs2_do_readpage_nolock(folio->mapping->host, &folio->page); + int ret = jffs2_do_readpage_nolock(folio->mapping->host, folio); folio_unlock(folio); return ret; } @@ -221,7 +219,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, */ if (!folio_test_uptodate(folio)) { mutex_lock(&f->sem); - ret = jffs2_do_readpage_nolock(inode, &folio->page); + ret = jffs2_do_readpage_nolock(inode, folio); mutex_unlock(&f->sem); if (ret) { folio_unlock(folio); From 2da4c51a66cd633d9fc8feb3e98eaa5d9924fd45 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 14 Aug 2024 20:59:13 +0100 Subject: [PATCH 087/168] jffs2: Use a folio in jffs2_garbage_collect_dnode() Call read_cache_folio() instead of read_cache_page() to get the folio containing the page. No attempt is made here to support large folios as I assume that will never be interesting for jffs2. Includes a switch from kmap to kmap_local which looks safe. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240814195915.249871-3-willy@infradead.org Reviewed-by: Zhihao Cheng Signed-off-by: Christian Brauner --- fs/jffs2/gc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5c6602f3c189..822949d0eb00 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1171,7 +1171,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era uint32_t alloclen, offset, orig_end, orig_start; int ret = 0; unsigned char *comprbuf = NULL, *writebuf; - struct page *page; + struct folio *folio; unsigned char *pg_ptr; memset(&ri, 0, sizeof(ri)); @@ -1317,25 +1317,25 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* The rules state that we must obtain the page lock *before* f->sem, so + /* The rules state that we must obtain the folio lock *before* f->sem, so * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's * actually going to *change* so we're safe; we only allow reading. * * It is important to note that jffs2_write_begin() will ensure that its - * page is marked Uptodate before allocating space. That means that if we - * end up here trying to GC the *same* page that jffs2_write_begin() is - * trying to write out, read_cache_page() will not deadlock. */ + * folio is marked uptodate before allocating space. That means that if we + * end up here trying to GC the *same* folio that jffs2_write_begin() is + * trying to write out, read_cache_folio() will not deadlock. */ mutex_unlock(&f->sem); - page = read_cache_page(inode->i_mapping, start >> PAGE_SHIFT, + folio = read_cache_folio(inode->i_mapping, start >> PAGE_SHIFT, __jffs2_read_folio, NULL); - if (IS_ERR(page)) { - pr_warn("read_cache_page() returned error: %ld\n", - PTR_ERR(page)); + if (IS_ERR(folio)) { + pr_warn("read_cache_folio() returned error: %ld\n", + PTR_ERR(folio)); mutex_lock(&f->sem); - return PTR_ERR(page); + return PTR_ERR(folio); } - pg_ptr = kmap(page); + pg_ptr = kmap_local_folio(folio, 0); mutex_lock(&f->sem); offset = start; @@ -1400,7 +1400,6 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era } } - kunmap(page); - put_page(page); + folio_release_kmap(folio, pg_ptr); return ret; } From 820a185896b77814557302b981b092a9e7b36814 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jul 2024 15:15:35 +0200 Subject: [PATCH 088/168] fcntl: add F_CREATED_QUERY Systemd has a helper called openat_report_new() that returns whether a file was created anew or it already existed before for cases where O_CREAT has to be used without O_EXCL (cf. [1]). That apparently isn't something that's specific to systemd but it's where I noticed it. The current logic is that it first attempts to open the file without O_CREAT | O_EXCL and if it gets ENOENT the helper tries again with both flags. If that succeeds all is well. If it now reports EEXIST it retries. That works fairly well but some corner cases make this more involved. If this operates on a dangling symlink the first openat() without O_CREAT | O_EXCL will return ENOENT but the second openat() with O_CREAT | O_EXCL will fail with EEXIST. The reason is that openat() without O_CREAT | O_EXCL follows the symlink while O_CREAT | O_EXCL doesn't for security reasons. So it's not something we can really change unless we add an explicit opt-in via O_FOLLOW which seems really ugly. The caller could try and use fanotify() to register to listen for creation events in the directory before calling openat(). The caller could then compare the returned tid to its own tid to ensure that even in threaded environments it actually created the file. That might work but is a lot of work for something that should be fairly simple and I'm uncertain about it's reliability. The caller could use a bpf lsm hook to hook into security_file_open() to figure out whether they created the file. That also seems a bit wild. So let's add F_CREATED_QUERY which allows the caller to check whether they actually did create the file. That has caveats of course but I don't think they are problematic: * In multi-threaded environments a thread can only be sure that it did create the file if it calls openat() with O_CREAT. In other words, it's obviously not enough to just go through it's fdtable and check these fds because another thread could've created the file. * If there's any codepaths where an openat() with O_CREAT would yield the same struct file as that of another thread it would obviously cause wrong results. I'm not aware of any such codepaths from openat() itself. Imho, that would be a bug. * Related to the previous point, calling the new fcntl() on files created and opened via special-purpose system calls or ioctl()s would cause wrong results only if the affected subsystem a) raises FMODE_CREATED and b) may return the same struct file for two different calls. I'm not seeing anything outside of regular VFS code that raises FMODE_CREATED. There is code for b) in e.g., the drm layer where the same struct file is resurfaced but again FMODE_CREATED isn't used and it would be very misleading if it did. Link: https://github.com/systemd/systemd/blob/11d5e2b5fbf9f6bfa5763fd45b56829ad4f0777f/src/basic/fs-util.c#L1078 [1] Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-1-e8153a2f1991@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fcntl.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 3 +++ 2 files changed, 13 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 300e5d9ad913..22ec683ad8f8 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -343,6 +343,12 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +/* Let the caller figure out whether a given file was just created. */ +static long f_created_query(const struct file *filp) +{ + return !!(filp->f_mode & FMODE_CREATED); +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -352,6 +358,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, long err = -EINVAL; switch (cmd) { + case F_CREATED_QUERY: + err = f_created_query(filp); + break; case F_DUPFD: err = f_dupfd(argi, filp, 0); break; @@ -463,6 +472,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, static int check_fcntl_cmd(unsigned cmd) { switch (cmd) { + case F_CREATED_QUERY: case F_DUPFD: case F_DUPFD_CLOEXEC: case F_DUPFD_QUERY: diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index c0bcc185fa48..e55a3314bcb0 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -16,6 +16,9 @@ #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) +/* Was the file just created? */ +#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4) + /* * Cancel a blocking posix lock; internal use only until we expose an * asynchronous lock api to userspace: From d0fe8920cbe42547798fd806f078eeaaba05df18 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Jul 2024 15:15:36 +0200 Subject: [PATCH 089/168] selftests: add F_CREATED_QUERY tests Add simple selftests for fcntl(fd, F_CREATED_QUERY, 0). Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-2-e8153a2f1991@kernel.org Signed-off-by: Christian Brauner --- .../testing/selftests/core/close_range_test.c | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 12b4eb9d0434..e0d9851fe1c9 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -26,6 +26,10 @@ #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3) #endif +#ifndef F_CREATED_QUERY +#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4) +#endif + static inline int sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) { @@ -624,4 +628,39 @@ TEST(close_range_bitmap_corruption) EXPECT_EQ(0, WEXITSTATUS(status)); } +TEST(fcntl_created) +{ + for (int i = 0; i < 101; i++) { + int fd; + char path[PATH_MAX]; + + fd = open("/dev/null", O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0) { + if (errno == ENOENT) + SKIP(return, + "Skipping test since /dev/null does not exist"); + } + + /* We didn't create "/dev/null". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + + sprintf(path, "aaaa_%d", i); + fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0600); + ASSERT_GE(fd, 0); + + /* We created "aaaa_%d". */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 1); + close(fd); + + fd = open(path, O_RDONLY | O_CLOEXEC); + ASSERT_GE(fd, 0); + + /* We're opening it again, so no positive creation check. */ + EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0); + close(fd); + unlink(path); + } +} + TEST_HARNESS_MAIN From e02bdb7208e7b5432cbe68392ae062810a7649fb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 25 Jul 2024 15:31:24 +0200 Subject: [PATCH 090/168] Merge patch series "Add an fcntl() to check file creation" Christian Brauner says: Systemd has a helper called openat_report_new() that returns whether a file was created anew or it already existed before for cases where O_CREAT has to be used without O_EXCL (cf. [1]). That apparently isn't something that's specific to systemd but it's where I noticed it. The current logic is that it first attempts to open the file without O_CREAT | O_EXCL and if it gets ENOENT the helper tries again with both flags. If that succeeds all is well. If it now reports EEXIST it retries. That works fairly well but some corner cases make this more involved. If this operates on a dangling symlink the first openat() without O_CREAT | O_EXCL will return ENOENT but the second openat() with O_CREAT | O_EXCL will fail with EEXIST. The reason is that openat() without O_CREAT | O_EXCL follows the symlink while O_CREAT | O_EXCL doesn't for security reasons. So it's not something we can really change unless we add an explicit opt-in via O_FOLLOW which seems really ugly. The caller could try and use fanotify() to register to listen for creation events in the directory before calling openat(). The caller could then compare the returned tid to its own tid to ensure that even in threaded environments it actually created the file. That might work but is a lot of work for something that should be fairly simple and I'm uncertain about it's reliability. The caller could use a bpf lsm hook to hook into security_file_open() to figure out whether they created the file. That also seems a bit wild. So let's add F_CREATED_QUERY which allows the caller to check whether they actually did create the file. That has caveats of course but I don't think they are problematic: * In multi-threaded environments a thread can only be sure that it did create the file if it calls openat() with O_CREAT. In other words, it's obviously not enough to just go through it's fdtable and check these fds because another thread could've created the file. * If there's any codepaths where an openat() with O_CREAT would yield the same struct file as that of another thread it would obviously cause wrong results. I'm not aware of any such codepaths from openat() itself. Imho, that would be a bug. * Related to the previous point, calling the new fcntl() on files created and opened via special-purpose system calls or ioctl()s would cause wrong results only if the affected subsystem a) raises FMODE_CREATED and b) may return the same struct file for two different calls. I'm not seeing anything outside of regular VFS code that raises FMODE_CREATED. There is code for b) in e.g., the drm layer where the same struct file is resurfaced but again FMODE_CREATED isn't used and it would be very misleading if it did. [1]: https://github.com/systemd/systemd/blob/11d5e2b5fbf9f6bfa5763fd45b56829ad4f0777f/src/basic/fs-util.c#L1078 * patches from https://lore.kernel.org/r/20240724-work-fcntl-v1-0-e8153a2f1991@kernel.org: selftests: add F_CREATED_QUERY tests fcntl: add F_CREATED_QUERY Signed-off-by: Christian Brauner From c393eaa85349941badf3ce5f087400dbaf3bbe02 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Fri, 26 Jul 2024 11:05:25 +0800 Subject: [PATCH 091/168] fs: don't flush in-flight wb switches for superblocks without cgroup writeback When deactivating any type of superblock, it had to wait for the in-flight wb switches to be completed. wb switches are executed in inode_switch_wbs_work_fn() which needs to acquire the wb_switch_rwsem and races against sync_inodes_sb(). If there are too much dirty data in the superblock, the waiting time may increase significantly. For superblocks without cgroup writeback such as tmpfs, they have nothing to do with the wb swithes, so the flushing can be avoided. Signed-off-by: Haifeng Xu Link: https://lore.kernel.org/r/20240726030525.180330-1-haifeng.xu@shopee.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 7 ++++++- fs/super.c | 2 +- include/linux/writeback.h | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b865a3fa52f3..4451ecff37c4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1132,6 +1132,7 @@ out_bdi_put: /** * cgroup_writeback_umount - flush inode wb switches for umount + * @sb: target super_block * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through @@ -1140,8 +1141,12 @@ out_bdi_put: * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ -void cgroup_writeback_umount(void) +void cgroup_writeback_umount(struct super_block *sb) { + + if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) + return; + /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). diff --git a/fs/super.c b/fs/super.c index 38d72a3cf6fc..28f42502bdf9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -621,7 +621,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - cgroup_writeback_umount(); + cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1a54676d843a..56b85841ae4c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,7 +217,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); -void cgroup_writeback_umount(void); +void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** @@ -324,7 +324,7 @@ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, { } -static inline void cgroup_writeback_umount(void) +static inline void cgroup_writeback_umount(struct super_block *sb) { } From 160210cea0356d31ffaf64b63a3ebeb26c7c0cd4 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Mon, 3 Jun 2024 09:48:34 +0800 Subject: [PATCH 092/168] fs/direct-io: Remove linux/prefetch.h include After commit c22198e78d52 ("direct-io: remove random prefetches"), Nothing in this file needs anything from `linux/prefetch.h`. Signed-off-by: Youling Tang Link: https://lore.kernel.org/r/20240603014834.45294-1-youling.tang@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/direct-io.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index b0aafe640fa4..bbd05f1a2145 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include #include #include -#include #include "internal.h" @@ -1121,11 +1120,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); - /* - * Avoid references to bdev if not absolutely needed to give - * the early prefetch in the caller enough time. - */ - /* watch out for a 0 len io from a tricksy fs */ if (iov_iter_rw(iter) == READ && !count) return 0; From de7007f27d9108fb00d801a6be8aae2ce4dd6101 Mon Sep 17 00:00:00 2001 From: Mohit0404 Date: Sat, 27 Jul 2024 12:51:34 +0530 Subject: [PATCH 093/168] Fixed: fs: file_table_c: Missing blank line warnings and struct declaration improved Fixed- WARNING: Missing a blank line after declarations WARNING: Missing a blank line after declarations Declaration format: improved struct file declaration format Signed-off-by: Mohit0404 Link: https://lore.kernel.org/r/20240727072134.130962-2-mohitpawar@mitaoe.ac.in Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/file_table.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index ca7843dde56d..35f2d5d9ca76 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -136,6 +136,7 @@ static int __init init_fs_stat_sysctls(void) register_sysctl_init("fs", fs_stat_sysctls); if (IS_ENABLED(CONFIG_BINFMT_MISC)) { struct ctl_table_header *hdr; + hdr = register_sysctl_mount_point("fs/binfmt_misc"); kmemleak_not_leak(hdr); } @@ -383,7 +384,9 @@ EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount); struct file *alloc_file_clone(struct file *base, int flags, const struct file_operations *fops) { - struct file *f = alloc_file(&base->f_path, flags, fops); + struct file *f; + + f = alloc_file(&base->f_path, flags, fops); if (!IS_ERR(f)) { path_get(&f->f_path); f->f_mapping = base->f_mapping; From 4bcda1eaf184e308f07f9c61d3a535f9ce477ce8 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 30 Jul 2024 10:58:13 +0200 Subject: [PATCH 094/168] mount: handle OOM on mnt_warn_timestamp_expiry If no page could be allocated, an error pointer was used as format string in pr_warn. Rearrange the code to return early in case of OOM. Also add a check for the return value of d_path. Fixes: f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry") Signed-off-by: Olaf Hering Link: https://lore.kernel.org/r/20240730085856.32385-1-olaf@aepfle.de [brauner: rewrite commit and commit message] Signed-off-by: Christian Brauner --- fs/namespace.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 328087a4df8a..155c6abda71d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2921,8 +2921,15 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { - char *buf = (char *)__get_free_page(GFP_KERNEL); - char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); + char *buf, *mntpath; + + buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) + mntpath = d_path(mountpoint, buf, PAGE_SIZE); + else + mntpath = ERR_PTR(-ENOMEM); + if (IS_ERR(mntpath)) + mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, @@ -2930,8 +2937,9 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); - free_page((unsigned long)buf); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; + if (buf) + free_page((unsigned long)buf); } } From 6c203968f5af720ee2f4a8a17734e40aabe7b96b Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 30 Jul 2024 14:18:34 +0200 Subject: [PATCH 095/168] fs/aio: Fix __percpu annotation of *cpu pointer in struct kioctx __percpu annotation of *cpu pointer in struct kioctx is put at the wrong place, resulting in several sparse warnings: aio.c:623:24: warning: incorrect type in argument 1 (different address spaces) aio.c:623:24: expected void [noderef] __percpu *__pdata aio.c:623:24: got struct kioctx_cpu *cpu aio.c:788:18: warning: incorrect type in assignment (different address spaces) aio.c:788:18: expected struct kioctx_cpu *cpu aio.c:788:18: got struct kioctx_cpu [noderef] __percpu * aio.c:835:24: warning: incorrect type in argument 1 (different address spaces) aio.c:835:24: expected void [noderef] __percpu *__pdata aio.c:835:24: got struct kioctx_cpu *cpu aio.c:940:16: warning: incorrect type in initializer (different address spaces) aio.c:940:16: expected void const [noderef] __percpu *__vpp_verify aio.c:940:16: got struct kioctx_cpu * aio.c:958:16: warning: incorrect type in initializer (different address spaces) aio.c:958:16: expected void const [noderef] __percpu *__vpp_verify aio.c:958:16: got struct kioctx_cpu * Put __percpu annotation at the right place to fix these warnings. Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20240730121915.4514-1-ubizjak@gmail.com Reviewed-by: Jan Kara Cc: Benjamin LaHaise Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Andrew Morton Signed-off-by: Christian Brauner --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 6066f64967b3..e8920178b50f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -100,7 +100,7 @@ struct kioctx { unsigned long user_id; - struct __percpu kioctx_cpu *cpu; + struct kioctx_cpu __percpu *cpu; /* * For percpu reqs_available, number of slots we move to/from global From 6a64c5220c5df235448b846aeff3c0660d4cc83e Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 31 Jul 2024 23:10:27 +1000 Subject: [PATCH 096/168] autofs: fix missing fput for FSCONFIG_SET_FD If you pass an fd using FSCONFIG_SET_FD, autofs_parse_fd() "steals" the param->file and so the fs_context infrastructure will not do fput() for us. Fixes: e6ec453bd0f0 ("autofs: convert autofs to use the new mount api") Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/r/20240731-fsconfig-fsparam_fd-fixes-v2-1-e7c472224417@cyphar.com Signed-off-by: Christian Brauner --- fs/autofs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index cf792d4de4f1..64faa6c51f60 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -172,8 +172,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, ret = autofs_check_pipe(pipe); if (ret < 0) { errorf(fc, "Invalid/unusable pipe"); - if (param->type != fs_value_is_file) - fput(pipe); + fput(pipe); return -EBADF; } From 66e5cfee6280a361c429b6a6bfd66044e4644e86 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 31 Jul 2024 23:10:28 +1000 Subject: [PATCH 097/168] coda: use param->file for FSCONFIG_SET_FD While the old code did support FSCONFIG_SET_FD, there's no need to re-get the file the fs_context infrastructure already grabbed for us. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/r/20240731-fsconfig-fsparam_fd-fixes-v2-2-e7c472224417@cyphar.com Signed-off-by: Christian Brauner --- fs/coda/inode.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6898dc621011..6896fce122e1 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -119,31 +119,43 @@ static const struct fs_parameter_spec coda_param_specs[] = { {} }; -static int coda_parse_fd(struct fs_context *fc, int fd) +static int coda_set_idx(struct fs_context *fc, struct file *file) { struct coda_fs_context *ctx = fc->fs_private; - struct fd f; struct inode *inode; int idx; - f = fdget(fd); - if (!f.file) - return -EBADF; - inode = file_inode(f.file); + inode = file_inode(file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { - fdput(f); - return invalf(fc, "code: Not coda psdev"); + return invalf(fc, "coda: Not coda psdev"); } - idx = iminor(inode); - fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) return invalf(fc, "coda: Bad minor number"); ctx->idx = idx; return 0; } +static int coda_parse_fd(struct fs_context *fc, struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct file *file; + int err; + + if (param->type == fs_value_is_file) { + file = param->file; + param->file = NULL; + } else { + file = fget(result->uint_32); + } + if (!file) + return -EBADF; + + err = coda_set_idx(fc, file); + fput(file); + return err; +} + static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; @@ -155,7 +167,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) switch (opt) { case Opt_fd: - return coda_parse_fd(fc, result.uint_32); + return coda_parse_fd(fc, param, &result); } return 0; @@ -167,6 +179,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) */ static int coda_parse_monolithic(struct fs_context *fc, void *_data) { + struct file *file; struct coda_mount_data *data = _data; if (!data) @@ -175,7 +188,11 @@ static int coda_parse_monolithic(struct fs_context *fc, void *_data) if (data->version != CODA_MOUNT_VERSION) return invalf(fc, "coda: Bad mount version"); - coda_parse_fd(fc, data->fd); + file = fget(data->fd); + if (file) { + coda_set_idx(fc, file); + fput(file); + } return 0; } From c01a5d89e5c8abe638107be2a4ea9e4c7fcdd7f6 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Fri, 2 Aug 2024 17:19:01 +0800 Subject: [PATCH 098/168] percpu-rwsem: remove the unused parameter 'read' In the function percpu_rwsem_release, the parameter `read` is unused, so remove it. Signed-off-by: Wang Long Link: https://lore.kernel.org/r/20240802091901.2546797-1-w@laoqinren.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 2 +- include/linux/fs.h | 2 +- include/linux/percpu-rwsem.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/super.c b/fs/super.c index 28f42502bdf9..33f35abb1990 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1905,7 +1905,7 @@ static void lockdep_sb_freeze_release(struct super_block *sb) int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) - percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); + percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..c0286d479c9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1683,7 +1683,7 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 36b942b67b7d..c012df33a9f0 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -145,7 +145,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, - bool read, unsigned long ip) + unsigned long ip) { lock_release(&sem->dep_map, ip); } From ce26c4fc98757299894f802b470daf2aae7f18b6 Mon Sep 17 00:00:00 2001 From: Adrian Ratiu Date: Fri, 2 Aug 2024 11:02:25 +0300 Subject: [PATCH 099/168] proc: add config & param to block forcing mem writes This adds a Kconfig option and boot param to allow removing the FOLL_FORCE flag from /proc/pid/mem write calls because it can be abused. The traditional forcing behavior is kept as default because it can break GDB and some other use cases. Previously we tried a more sophisticated approach allowing distributions to fine-tune /proc/pid/mem behavior, however that got NAK-ed by Linus [1], who prefers this simpler approach with semantics also easier to understand for users. Link: https://lore.kernel.org/lkml/CAHk-=wiGWLChxYmUA5HrT5aopZrB7_2VTa0NLZcxORgkUe5tEQ@mail.gmail.com/ [1] Cc: Doug Anderson Cc: Jeff Xu Cc: Jann Horn Cc: Kees Cook Cc: Ard Biesheuvel Cc: Christian Brauner Suggested-by: Linus Torvalds Signed-off-by: Linus Torvalds Signed-off-by: Adrian Ratiu Link: https://lore.kernel.org/r/20240802080225.89408-1-adrian.ratiu@collabora.com Signed-off-by: Christian Brauner --- .../admin-guide/kernel-parameters.txt | 10 +++ fs/proc/base.c | 61 ++++++++++++++++++- security/Kconfig | 32 ++++++++++ 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 09126bb8cc9f..be010fec7654 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4788,6 +4788,16 @@ printk.time= Show timing data prefixed to each printk message line Format: (1/Y/y=enable, 0/N/n=disable) + proc_mem.force_override= [KNL] + Format: {always | ptrace | never} + Traditionally /proc/pid/mem allows memory permissions to be + overridden without restrictions. This option may be set to + restrict that. Can be one of: + - 'always': traditional behavior always allows mem overrides. + - 'ptrace': only allow mem overrides for active ptracers. + - 'never': never allow mem overrides. + If not specified, default is the CONFIG_PROC_MEM_* choice. + processor.max_cstate= [HW,ACPI] Limit processor to maximum C-state max_cstate=9 overrides any DMI blacklist limit. diff --git a/fs/proc/base.c b/fs/proc/base.c index 72a1acd03675..f389c69767fa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -117,6 +118,40 @@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; +enum proc_mem_force { + PROC_MEM_FORCE_ALWAYS, + PROC_MEM_FORCE_PTRACE, + PROC_MEM_FORCE_NEVER +}; + +static enum proc_mem_force proc_mem_force_override __ro_after_init = + IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : + IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : + PROC_MEM_FORCE_ALWAYS; + +static const struct constant_table proc_mem_force_table[] __initconst = { + { "always", PROC_MEM_FORCE_ALWAYS }, + { "ptrace", PROC_MEM_FORCE_PTRACE }, + { "never", PROC_MEM_FORCE_NEVER }, + { } +}; + +static int __init early_proc_mem_force_override(char *buf) +{ + if (!buf) + return -EINVAL; + + /* + * lookup_constant() defaults to proc_mem_force_override to preseve + * the initial Kconfig choice in case an invalid param gets passed. + */ + proc_mem_force_override = lookup_constant(proc_mem_force_table, + buf, proc_mem_force_override); + + return 0; +} +early_param("proc_mem.force_override", early_proc_mem_force_override); + struct pid_entry { const char *name; unsigned int len; @@ -835,6 +870,28 @@ static int mem_open(struct inode *inode, struct file *file) return ret; } +static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) +{ + struct task_struct *task; + bool ptrace_active = false; + + switch (proc_mem_force_override) { + case PROC_MEM_FORCE_NEVER: + return false; + case PROC_MEM_FORCE_PTRACE: + task = get_proc_task(file_inode(file)); + if (task) { + ptrace_active = READ_ONCE(task->ptrace) && + READ_ONCE(task->mm) == mm && + READ_ONCE(task->parent) == current; + put_task_struct(task); + } + return ptrace_active; + default: + return true; + } +} + static ssize_t mem_rw(struct file *file, char __user *buf, size_t count, loff_t *ppos, int write) { @@ -855,7 +912,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); + flags = write ? FOLL_WRITE : 0; + if (proc_mem_foll_force(file, mm)) + flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); diff --git a/security/Kconfig b/security/Kconfig index 412e76f1575d..a93c1a9b7c28 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -19,6 +19,38 @@ config SECURITY_DMESG_RESTRICT If you are unsure how to answer this question, answer N. +choice + prompt "Allow /proc/pid/mem access override" + default PROC_MEM_ALWAYS_FORCE + help + Traditionally /proc/pid/mem allows users to override memory + permissions for users like ptrace, assuming they have ptrace + capability. + + This allows people to limit that - either never override, or + require actual active ptrace attachment. + + Defaults to the traditional behavior (for now) + +config PROC_MEM_ALWAYS_FORCE + bool "Traditional /proc/pid/mem behavior" + help + This allows /proc/pid/mem accesses to override memory mapping + permissions if you have ptrace access rights. + +config PROC_MEM_FORCE_PTRACE + bool "Require active ptrace() use for access override" + help + This allows /proc/pid/mem accesses to override memory mapping + permissions for active ptracers like gdb. + +config PROC_MEM_NO_FORCE + bool "Never" + help + Never override memory mapping permissions + +endchoice + config SECURITY bool "Enable different security models" depends on SYSFS From 63101032df1271d3158331aeb15aef6b7a732657 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 3 Aug 2024 19:50:00 +0800 Subject: [PATCH 100/168] fs: mounts: Remove unused declaration mnt_cursor_del() Commit 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Link: https://lore.kernel.org/r/20240803115000.589872-1-yuehaibing@huawei.com Signed-off-by: Christian Brauner --- fs/mount.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/mount.h b/fs/mount.h index ad4b1ddebb54..0a78f85cf737 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -153,5 +153,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) list_add_tail(&mnt->mnt_list, dt_list); } -extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); bool has_locked_children(struct mount *mnt, struct dentry *dentry); From a3d4c573e6807a7bc26c8745bcdd3e1ea5b99878 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 2 Aug 2024 17:45:02 -0400 Subject: [PATCH 101/168] fs: remove comment about d_rcu_to_refcount This function no longer exists. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240802-openfast-v1-1-a1cff2a33063@kernel.org Signed-off-by: Christian Brauner --- fs/dcache.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1..1af75fa68638 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2163,9 +2163,6 @@ seqretry: * without taking d_lock and checking d_seq sequence count against @seq * returned here. * - * A refcount may be taken on the found dentry with the d_rcu_to_refcount - * function. - * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks From 2ce051550f4970deeff14d5e4dfd5c02844b7ef6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 2 Aug 2024 17:45:03 -0400 Subject: [PATCH 102/168] fs: add a kerneldoc header over lookup_fast The lookup_fast helper in fs/namei.c has some subtlety in how dentries are returned. Document them. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240802-openfast-v1-2-a1cff2a33063@kernel.org Signed-off-by: Christian Brauner --- fs/namei.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 5512cb10fa89..c3459785704d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1639,6 +1639,20 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, } EXPORT_SYMBOL(lookup_one_qstr_excl); +/** + * lookup_fast - do fast lockless (but racy) lookup of a dentry + * @nd: current nameidata + * + * Do a fast, but racy lookup in the dcache for the given dentry, and + * revalidate it. Returns a valid dentry pointer or NULL if one wasn't + * found. On error, an ERR_PTR will be returned. + * + * If this function returns a valid dentry and the walk is no longer + * lazy, the dentry will carry a reference that must later be put. If + * RCU mode is still in force, then this is not the case and the dentry + * must be legitimized before use. If this returns NULL, then the walk + * will no longer be in RCU mode. + */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; From 2258e22f05aff5865c93cbd4e9acba55b295d832 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 19 Aug 2024 00:58:44 +0100 Subject: [PATCH 103/168] Squashfs: Update page_actor to not use page->index This commit removes an unnecessary use of page->index, and moves the other use over to folio->index. Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240818235847.170468-2-phillip@squashfs.org.uk Signed-off-by: Christian Brauner --- fs/squashfs/file.c | 2 +- fs/squashfs/file_direct.c | 3 ++- fs/squashfs/page_actor.c | 11 ++++++++--- fs/squashfs/page_actor.h | 3 ++- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8c1e7f9a609..2b6b63f4ccd1 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -589,7 +589,7 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, - expected); + expected, start); if (!actor) goto skip_pages; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 2a689ce71de9..0586e6ba94bf 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -67,7 +67,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, * Create a "page actor" which will kmap and kunmap the * page cache pages appropriately within the decompressor */ - actor = squashfs_page_actor_init_special(msblk, page, pages, expected); + actor = squashfs_page_actor_init_special(msblk, page, pages, expected, + start_index << PAGE_SHIFT); if (actor == NULL) goto out; diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 81af6c4ca115..2b3e807d4dea 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -60,6 +60,11 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, } /* Implementation of page_actor for decompressing directly into page cache. */ +static loff_t page_next_index(struct squashfs_page_actor *actor) +{ + return page_folio(actor->page[actor->next_page])->index; +} + static void *handle_next_page(struct squashfs_page_actor *actor) { int max_pages = (actor->length + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -68,7 +73,7 @@ static void *handle_next_page(struct squashfs_page_actor *actor) return NULL; if ((actor->next_page == actor->pages) || - (actor->next_index != actor->page[actor->next_page]->index)) { + (actor->next_index != page_next_index(actor))) { actor->next_index++; actor->returned_pages++; actor->last_page = NULL; @@ -103,7 +108,7 @@ static void direct_finish_page(struct squashfs_page_actor *actor) } struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_info *msblk, - struct page **page, int pages, int length) + struct page **page, int pages, int length, loff_t start_index) { struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); @@ -125,7 +130,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->pages = pages; actor->next_page = 0; actor->returned_pages = 0; - actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); + actor->next_index = start_index >> PAGE_SHIFT; actor->pageaddr = NULL; actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 97d4983559b1..c6d837f0e9ca 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -29,7 +29,8 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, int pages, int length); extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, - struct page **page, int pages, int length); + struct page **page, int pages, int length, + loff_t start_index); static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { struct page *last_page = actor->last_page; From 6f09ffb1f4fadf156d5d1032217b9d1fa4e07dbe Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 19 Aug 2024 00:58:45 +0100 Subject: [PATCH 104/168] Squashfs: Update squashfs_readahead() to not use page->index This commit removes references to page->index in the pages returned from __readahead_batch(), and instead uses the 'start' variable. This does reveal a bug in the previous code in that 'start' was not updated every time around the loop. This is fixed in this commit. Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240818235847.170468-3-phillip@squashfs.org.uk Signed-off-by: Christian Brauner --- fs/squashfs/file.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 2b6b63f4ccd1..50fe5a078b83 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -551,7 +551,6 @@ static void squashfs_readahead(struct readahead_control *ractl) return; for (;;) { - pgoff_t index; int res, bsize; u64 block = 0; unsigned int expected; @@ -570,13 +569,8 @@ static void squashfs_readahead(struct readahead_control *ractl) if (readahead_pos(ractl) >= i_size_read(inode)) goto skip_pages; - index = pages[0]->index >> shift; - - if ((pages[nr_pages - 1]->index >> shift) != index) - goto skip_pages; - - if (index == file_end && squashfs_i(inode)->fragment_block != - SQUASHFS_INVALID_BLK) { + if (start >> msblk->block_log == file_end && + squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, expected); if (res) @@ -584,7 +578,7 @@ static void squashfs_readahead(struct readahead_control *ractl) continue; } - bsize = read_blocklist(inode, index, &block); + bsize = read_blocklist(inode, start >> msblk->block_log, &block); if (bsize == 0) goto skip_pages; @@ -602,7 +596,7 @@ static void squashfs_readahead(struct readahead_control *ractl) /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (index == file_end && bytes && last_page) + if (start >> msblk->block_log == file_end && bytes && last_page) memzero_page(last_page, bytes, PAGE_SIZE - bytes); @@ -616,6 +610,8 @@ static void squashfs_readahead(struct readahead_control *ractl) unlock_page(pages[i]); put_page(pages[i]); } + + start += readahead_batch_length(ractl); } kfree(pages); From 7f73fcde4d93072d4003fee0949359757e8d0e41 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 19 Aug 2024 00:58:46 +0100 Subject: [PATCH 105/168] Squashfs: Update squashfs_readpage_block() to not use page->index This commit replaces references to page->index to folio->index or their equivalent. Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240818235847.170468-4-phillip@squashfs.org.uk Signed-off-by: Christian Brauner --- fs/squashfs/file_direct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 0586e6ba94bf..646d4d421f99 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -23,15 +23,15 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, int expected) { + struct folio *folio = page_folio(target_page); struct inode *inode = target_page->mapping->host; struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT; int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - loff_t start_index = target_page->index & ~mask; + loff_t start_index = folio->index & ~mask; loff_t end_index = start_index | mask; int i, n, pages, bytes, res = -ENOMEM; - struct page **page; + struct page **page, *last_page; struct squashfs_page_actor *actor; void *pageaddr; @@ -46,7 +46,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Try to grab all the pages covered by the Squashfs block */ for (i = 0, n = start_index; n <= end_index; n++) { - page[i] = (n == target_page->index) ? target_page : + page[i] = (n == folio->index) ? target_page : grab_cache_page_nowait(target_page->mapping, n); if (page[i] == NULL) @@ -75,7 +75,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Decompress directly into the page cache buffers */ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res < 0) goto mark_errored; @@ -87,8 +87,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (page[pages - 1]->index == end_index && bytes) { - pageaddr = kmap_local_page(page[pages - 1]); + if (end_index == file_end && last_page && bytes) { + pageaddr = kmap_local_page(last_page); memset(pageaddr + bytes, 0, PAGE_SIZE - bytes); kunmap_local(pageaddr); } From fd54fa6efe0dd3894d6fd703f8856675b4bf8315 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Mon, 19 Aug 2024 00:58:47 +0100 Subject: [PATCH 106/168] Squashfs: Rewrite and update squashfs_readahead_fragment() to not use page->index The previous implementation lacked error checking (e.g. the bytes returned by squashfs_fill_page() is not checked), and the use of page->index could not be removed without substantially rewriting the routine to use the page actor abstraction used elsewhere. Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240818235847.170468-5-phillip@squashfs.org.uk Signed-off-by: Christian Brauner --- fs/squashfs/file.c | 66 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 50fe5a078b83..5a3745e52025 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -494,39 +494,73 @@ out: } static int squashfs_readahead_fragment(struct page **page, - unsigned int pages, unsigned int expected) + unsigned int pages, unsigned int expected, loff_t start) { struct inode *inode = page[0]->mapping->host; struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, squashfs_i(inode)->fragment_block, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; - unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; - int error = buffer->error; + int i, bytes, copied; + struct squashfs_page_actor *actor; + unsigned int offset; + void *addr; + struct page *last_page; - if (error) + if (buffer->error) goto out; - expected += squashfs_i(inode)->fragment_offset; + actor = squashfs_page_actor_init_special(msblk, page, pages, + expected, start); + if (!actor) + goto out; - for (n = 0; n < pages; n++) { - unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; - unsigned int offset = base + squashfs_i(inode)->fragment_offset; + squashfs_actor_nobuff(actor); + addr = squashfs_first_page(actor); - if (expected > offset) { - unsigned int avail = min_t(unsigned int, expected - - offset, PAGE_SIZE); + for (copied = offset = 0; offset < expected; offset += PAGE_SIZE) { + int avail = min_t(int, expected - offset, PAGE_SIZE); - squashfs_fill_page(page[n], buffer, offset, avail); + if (!IS_ERR(addr)) { + bytes = squashfs_copy_data(addr, buffer, offset + + squashfs_i(inode)->fragment_offset, avail); + + if (bytes != avail) + goto failed; } - unlock_page(page[n]); - put_page(page[n]); + copied += avail; + addr = squashfs_next_page(actor); } + last_page = squashfs_page_actor_free(actor); + + if (copied == expected) { + /* Last page (if present) may have trailing bytes not filled */ + bytes = copied % PAGE_SIZE; + if (bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); + + for (i = 0; i < pages; i++) { + flush_dcache_page(page[i]); + SetPageUptodate(page[i]); + } + } + + for (i = 0; i < pages; i++) { + unlock_page(page[i]); + put_page(page[i]); + } + + squashfs_cache_put(buffer); + return 0; + +failed: + squashfs_page_actor_free(actor); + out: squashfs_cache_put(buffer); - return error; + return 1; } static void squashfs_readahead(struct readahead_control *ractl) @@ -572,7 +606,7 @@ static void squashfs_readahead(struct readahead_control *ractl) if (start >> msblk->block_log == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, - expected); + expected, start); if (res) goto skip_pages; continue; From 16febd5eb3699c0fc1a27a3e60a69033a450ed61 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 5 Aug 2024 15:17:21 +0200 Subject: [PATCH 107/168] exec: don't WARN for racy path_noexec check Both i_mode and noexec checks wrapped in WARN_ON stem from an artifact of the previous implementation. They used to legitimately check for the condition, but that got moved up in two commits: 633fb6ac3980 ("exec: move S_ISREG() check earlier") 0fd338b2d2cd ("exec: move path_noexec() check earlier") Instead of being removed said checks are WARN_ON'ed instead, which has some debug value. However, the spurious path_noexec check is racy, resulting in unwarranted warnings should someone race with setting the noexec flag. One can note there is more to perm-checking whether execve is allowed and none of the conditions are guaranteed to still hold after they were tested for. Additionally this does not validate whether the code path did any perm checking to begin with -- it will pass if the inode happens to be regular. Keep the redundant path_noexec() check even though it's mindless nonsense checking for guarantee that isn't given so drop the WARN. Reword the commentary and do small tidy ups while here. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240805131721.765484-1-mjguzik@gmail.com [brauner: keep redundant path_noexec() check] Signed-off-by: Christian Brauner --- fs/exec.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 50e76cc633c4..caae051c5a95 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -145,13 +145,11 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) goto out; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * Check do_open_execat() for an explanation. */ error = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; @@ -954,7 +952,6 @@ EXPORT_SYMBOL(transfer_args_to_stack); static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; - int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, @@ -971,24 +968,20 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) - goto out; + return file; /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. + * In the past the regular type check was here. It moved to may_open() in + * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is + * an invariant that all non-regular files error out before we get here. */ - err = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) - goto exit; + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) { + fput(file); + return ERR_PTR(-EACCES); + } -out: return file; - -exit: - fput(file); - return ERR_PTR(err); } /** From 196de44f932a6e24c161a4a179e74c9764ffcd9e Mon Sep 17 00:00:00 2001 From: Yuesong Li Date: Tue, 6 Aug 2024 11:47:10 +0800 Subject: [PATCH 108/168] fs/namespace.c: Fix typo in comment replace 'permanetly' with 'permanently' in the comment & replace 'propogated' with 'propagated' in the comment Signed-off-by: Yuesong Li Link: https://lore.kernel.org/r/20240806034710.2807788-1-liyuesong@vivo.com Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 155c6abda71d..5f2dddee0074 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1774,7 +1774,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) list_del_init(&p->mnt_child); } - /* Add propogated mounts to the tmp_list */ + /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); @@ -5613,7 +5613,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; - /* Is the directory permanetly empty? */ + /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } From 08d313b4ec7d81a7a082a44bb3f99085eb0af5a1 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Fri, 2 Aug 2024 22:54:55 -0400 Subject: [PATCH 109/168] file: remove outdated comment after close_fd() Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: linux-fsdevel@vger.kernel.org The comment on EXPORT_SYMBOL(close_fd) was added in commit 2ca2a09d6215 ("fs: add ksys_close() wrapper; remove in-kernel calls to sys_close()"), before commit 8760c909f54a ("file: Rename __close_fd to close_fd and remove the files parameter") gave the function its current name, however commit 1572bfdf21d4 ("file: Replace ksys_close with close_fd") removes the referenced caller entirely, obsoleting this comment. Signed-off-by: Joel Savitz Link: https://lore.kernel.org/r/20240803025455.239276-1-jsavitz@redhat.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file.c b/fs/file.c index 655338effe9c..976ecd4ce2c6 100644 --- a/fs/file.c +++ b/fs/file.c @@ -672,7 +672,7 @@ int close_fd(unsigned fd) return filp_close(file, files); } -EXPORT_SYMBOL(close_fd); /* for ksys_close() */ +EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table From 538863bad22410c8ae0c99601a50d7a334fa9551 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 6 Aug 2024 19:28:46 +0200 Subject: [PATCH 110/168] vfs: dodge smp_mb in break_lease and break_deleg in the common case These inlines show up in the fast path (e.g., in do_dentry_open()) and induce said full barrier regarding i_flctx access when in most cases the pointer is NULL. The pointer can be safely checked before issuing the barrier, dodging it in most cases as a result. It is plausible the consume fence would be sufficient, but I don't want to go audit all callers regarding what they before calling here. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240806172846.886570-1-mjguzik@gmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/filelock.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index daee999d05f3..bb44224c6676 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -420,28 +420,38 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { + struct file_lock_context *flctx; + /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ + flctx = READ_ONCE(inode->i_flctx); + if (!flctx) + return 0; smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { + struct file_lock_context *flctx; + /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ + flctx = READ_ONCE(inode->i_flctx); + if (!flctx) + return 0; smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } From 2c75063473d2758120b3d4c2da3e6efbf1654381 Mon Sep 17 00:00:00 2001 From: Xiaxi Shen Date: Wed, 7 Aug 2024 00:05:36 -0700 Subject: [PATCH 111/168] Fix spelling and gramatical errors Fixed 3 typos in design.rst Signed-off-by: Xiaxi Shen Link: https://lore.kernel.org/r/20240807070536.14536-1-shenxiaxi26@gmail.com Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- Documentation/filesystems/iomap/design.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index f8ee3427bc1a..37594e1c5914 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -142,9 +142,9 @@ Definitions * **pure overwrite**: A write operation that does not require any metadata or zeroing operations to perform during either submission or completion. - This implies that the fileystem must have already allocated space + This implies that the filesystem must have already allocated space on disk as ``IOMAP_MAPPED`` and the filesystem must not place any - constaints on IO alignment or size. + constraints on IO alignment or size. The only constraints on I/O alignment are device level (minimum I/O size and alignment, typically sector size). @@ -394,7 +394,7 @@ iomap is concerned: * The **upper** level primitive is provided by the filesystem to coordinate access to different iomap operations. - The exact primitive is specifc to the filesystem and operation, + The exact primitive is specific to the filesystem and operation, but is often a VFS inode, pagecache invalidation, or folio lock. For example, a filesystem might take ``i_rwsem`` before calling ``iomap_file_buffered_write`` and ``iomap_file_unshare`` to prevent From 4eb76c5d9a8851735fd3ec5833ecf412e8921655 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 7 Aug 2024 10:52:31 +0000 Subject: [PATCH 112/168] eventpoll: Don't re-zero eventpoll fields Remove redundant and unnecessary code. ep_alloc uses kzalloc to create struct eventpoll, so there is no need to set fields to defaults of 0. This was accidentally introduced in commit 85455c795c07 ("eventpoll: support busy poll per epoll instance") and expanded on in follow-up commits. Signed-off-by: Joe Damato Link: https://lore.kernel.org/r/20240807105231.179158-1-jdamato@fastly.com Reviewed-by: Martin Karsten Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventpoll.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f53ca4f7fced..6c0a1e9715ea 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2200,11 +2200,6 @@ static int do_epoll_create(int flags) error = PTR_ERR(file); goto out_free_fd; } -#ifdef CONFIG_NET_RX_BUSY_POLL - ep->busy_poll_usecs = 0; - ep->busy_poll_budget = 0; - ep->prefer_busy_poll = false; -#endif ep->file = file; fd_install(fd, file); return fd; From 5c72f039d0ede1b73d62281793f80a984beb4b4a Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Tue, 6 Aug 2024 12:33:01 +0000 Subject: [PATCH 113/168] eventpoll: Annotate data-race of busy_poll_usecs A struct eventpoll's busy_poll_usecs field can be modified via a user ioctl at any time. All reads of this field should be annotated with READ_ONCE. Fixes: 85455c795c07 ("eventpoll: support busy poll per epoll instance") Cc: stable@vger.kernel.org Signed-off-by: Martin Karsten Link: https://lore.kernel.org/r/20240806123301.167557-1-jdamato@fastly.com Reviewed-by: Joe Damato Signed-off-by: Christian Brauner --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 6c0a1e9715ea..145f5349c612 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -420,7 +420,7 @@ static bool busy_loop_ep_timeout(unsigned long start_time, static bool ep_busy_loop_on(struct eventpoll *ep) { - return !!ep->busy_poll_usecs || net_busy_loop_on(); + return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) From 428942b6588e3bd02ea30e3419379cc248555bf0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 7 Aug 2024 08:10:27 -0400 Subject: [PATCH 114/168] fs: try an opportunistic lookup for O_CREAT opens too Today, when opening a file we'll typically do a fast lookup, but if O_CREAT is set, the kernel always takes the exclusive inode lock. I assume this was done with the expectation that O_CREAT means that we always expect to do the create, but that's often not the case. Many programs set O_CREAT even in scenarios where the file already exists. This patch rearranges the pathwalk-for-open code to also attempt a fast_lookup in certain O_CREAT cases. If a positive dentry is found, the inode_lock can be avoided altogether, and if auditing isn't enabled, it can stay in rcuwalk mode for the last step_into. One notable exception that is hopefully temporary: if we're doing an rcuwalk and auditing is enabled, skip the lookup_fast. Legitimizing the dentry in that case is more expensive than taking the i_rwsem for now. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240807-openfast-v3-1-040d132d2559@kernel.org Reviewed-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/namei.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index c3459785704d..3e34f4d97d83 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3605,6 +3605,49 @@ out_dput: return ERR_PTR(error); } +static inline bool trailing_slashes(struct nameidata *nd) +{ + return (bool)nd->last.name[nd->last.len]; +} + +static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) +{ + struct dentry *dentry; + + if (open_flag & O_CREAT) { + /* Don't bother on an O_EXCL create */ + if (open_flag & O_EXCL) + return NULL; + + /* + * FIXME: If auditing is enabled, then we'll have to unlazy to + * use the dentry. For now, don't do this, since it shifts + * contention from parent's i_rwsem to its d_lockref spinlock. + * Reconsider this once dentry refcounting handles heavy + * contention better. + */ + if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context()) + return NULL; + } + + if (trailing_slashes(nd)) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + + dentry = lookup_fast(nd); + if (IS_ERR_OR_NULL(dentry)) + return dentry; + + if (open_flag & O_CREAT) { + /* Discard negative dentries. Need inode_lock to do the create */ + if (!dentry->d_inode) { + if (!(nd->flags & LOOKUP_RCU)) + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { @@ -3622,28 +3665,39 @@ static const char *open_last_lookups(struct nameidata *nd, return handle_dots(nd, nd->last_type); } + /* We _can_ be in RCU mode here */ + dentry = lookup_fast_for_open(nd, open_flag); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (!(open_flag & O_CREAT)) { - if (nd->last.name[nd->last.len]) - nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - /* we _can_ be in RCU mode here */ - dentry = lookup_fast(nd); - if (IS_ERR(dentry)) - return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { - /* create side of things */ if (nd->flags & LOOKUP_RCU) { - if (!try_to_unlazy(nd)) + bool unlazied; + + /* can stay in rcuwalk if not auditing */ + if (dentry && audit_dummy_context()) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); + goto finish_lookup; + } + unlazied = dentry ? try_to_unlazy_next(nd, dentry) : + try_to_unlazy(nd); + if (!unlazied) return ERR_PTR(-ECHILD); } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); - /* trailing slashes? */ - if (unlikely(nd->last.name[nd->last.len])) + if (trailing_slashes(nd)) { + dput(dentry); return ERR_PTR(-EISDIR); + } + if (dentry) + goto finish_lookup; } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { From 766843ff3db4a74c8c23570e912bcbcf203fdd62 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Aug 2024 10:16:42 +0200 Subject: [PATCH 115/168] fs: move audit parent inode During O_CREAT we unconditionally audit the parent inode. This makes it difficult to support a fastpath for O_CREAT when the file already exists because we have to drop out of RCU lookup needlessly. We worked around this by checking whether audit was actually active but that's also suboptimal. Instead, move the audit of the parent inode down into lookup_open() at a point where it's mostly certain that the file needs to be created. This also reduced the inconsistency that currently exists: while audit on the parent is done independent of whether or no the file already existed an audit on the file is only performed if it has been created. By moving the audit down a bit we emit the audit a little later but it will allow us to simplify the fastpath for O_CREAT significantly. Signed-off-by: Christian Brauner --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 3e34f4d97d83..745415fcda57 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3535,6 +3535,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, return dentry; } + if (open_flag & O_CREAT) + audit_inode(nd->name, dir, AUDIT_INODE_PARENT); + /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the @@ -3691,7 +3694,6 @@ static const char *open_last_lookups(struct nameidata *nd, if (!unlazied) return ERR_PTR(-ECHILD); } - audit_inode(nd->name, dir, AUDIT_INODE_PARENT); if (trailing_slashes(nd)) { dput(dentry); return ERR_PTR(-EISDIR); From 3780d75f74db12ee617437162b9e7b4acd04180b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Aug 2024 11:12:50 +0200 Subject: [PATCH 116/168] fs: pull up trailing slashes check for O_CREAT Perform the check for trailing slashes right in the fastpath check and don't bother with any additional work. Signed-off-by: Christian Brauner --- fs/namei.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 745415fcda57..08eb9a53beb7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3618,6 +3618,9 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) struct dentry *dentry; if (open_flag & O_CREAT) { + if (trailing_slashes(nd)) + return ERR_PTR(-EISDIR); + /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; @@ -3684,20 +3687,13 @@ static const char *open_last_lookups(struct nameidata *nd, bool unlazied; /* can stay in rcuwalk if not auditing */ - if (dentry && audit_dummy_context()) { - if (trailing_slashes(nd)) - return ERR_PTR(-EISDIR); + if (dentry && audit_dummy_context()) goto finish_lookup; - } unlazied = dentry ? try_to_unlazy_next(nd, dentry) : try_to_unlazy(nd); if (!unlazied) return ERR_PTR(-ECHILD); } - if (trailing_slashes(nd)) { - dput(dentry); - return ERR_PTR(-EISDIR); - } if (dentry) goto finish_lookup; } From b6305e794422e7df0e6ea22892559f2693772752 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Aug 2024 11:15:41 +0200 Subject: [PATCH 117/168] fs: remove audit dummy context check Now that we audit later during lookup_open() we can remove the audit dummy context check. This simplifies things a lot. Signed-off-by: Christian Brauner --- fs/namei.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 08eb9a53beb7..96c79dec7184 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3624,16 +3624,6 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; - - /* - * FIXME: If auditing is enabled, then we'll have to unlazy to - * use the dentry. For now, don't do this, since it shifts - * contention from parent's i_rwsem to its d_lockref spinlock. - * Reconsider this once dentry refcounting handles heavy - * contention better. - */ - if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context()) - return NULL; } if (trailing_slashes(nd)) @@ -3687,7 +3677,7 @@ static const char *open_last_lookups(struct nameidata *nd, bool unlazied; /* can stay in rcuwalk if not auditing */ - if (dentry && audit_dummy_context()) + if (dentry) goto finish_lookup; unlazied = dentry ? try_to_unlazy_next(nd, dentry) : try_to_unlazy(nd); From d5984adcff6aa27c0475b6770e67a28e46c47d3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 8 Aug 2024 11:16:42 +0200 Subject: [PATCH 118/168] fs: rearrange general fastpath check now that O_CREAT uses it If we find a positive dentry we can now simply try and open it. All prelimiary checks are already done with or without O_CREAT. Signed-off-by: Christian Brauner --- fs/namei.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 96c79dec7184..2699601bf8e9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3666,26 +3666,17 @@ static const char *open_last_lookups(struct nameidata *nd, if (IS_ERR(dentry)) return ERR_CAST(dentry); - if (!(open_flag & O_CREAT)) { - if (likely(dentry)) - goto finish_lookup; + if (likely(dentry)) + goto finish_lookup; + if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { - bool unlazied; - - /* can stay in rcuwalk if not auditing */ - if (dentry) - goto finish_lookup; - unlazied = dentry ? try_to_unlazy_next(nd, dentry) : - try_to_unlazy(nd); - if (!unlazied) + if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } - if (dentry) - goto finish_lookup; } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { From 152e5c5902b121f9386ddd1c6eb4105c9e08dc34 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 8 Aug 2024 17:00:24 +0200 Subject: [PATCH 119/168] fs/select: Annotate struct poll_list with __counted_by() Add the __counted_by compiler attribute to the flexible array member entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20240808150023.72578-2-thorsten.blum@toblux.com Reviewed-by: Gustavo A. R. Silva Signed-off-by: Christian Brauner --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/select.c b/fs/select.c index 9515c3fa1a03..1a4849e2afb9 100644 --- a/fs/select.c +++ b/fs/select.c @@ -840,7 +840,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; unsigned int len; - struct pollfd entries[]; + struct pollfd entries[] __counted_by(len); }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) From 3773379a6a5b80cb08ef160172cdd901d3579b79 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 10 Aug 2024 08:47:53 +0200 Subject: [PATCH 120/168] vfs: only read fops once in fops_get/put In do_dentry_open() the usage is: f->f_op = fops_get(inode->i_fop); In generated asm the compiler emits 2 reads from inode->i_fop instead of just one. This popped up due to false-sharing where loads from that offset end up bouncing a cacheline during parallel open. While this is going to be fixed, the spurious load does not need to be there. This makes do_dentry_open() go down from 1177 to 1154 bytes. fops_put() is patched to maintain some consistency. No functional changes. Reviewed-by: Josef Bacik Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240810064753.1211441-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index c0286d479c9d..696c620f90e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2553,10 +2553,17 @@ struct super_block *sget(struct file_system_type *type, struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ -#define fops_get(fops) \ - (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) -#define fops_put(fops) \ - do { if (fops) module_put((fops)->owner); } while(0) +#define fops_get(fops) ({ \ + const struct file_operations *_fops = (fops); \ + (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ +}) + +#define fops_put(fops) ({ \ + const struct file_operations *_fops = (fops); \ + if (_fops) \ + module_put((_fops)->owner); \ +}) + /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies From 9edf11acb190b9e266a9e9faed8a1dfd0a2ea72f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Aug 2024 12:38:56 +0200 Subject: [PATCH 121/168] fs: move FMODE_UNSIGNED_OFFSET to fop_flags This is another flag that is statically set and doesn't need to use up an FMODE_* bit. Move it to ->fop_flags and free up another FMODE_* bit. (1) mem_open() used from proc_mem_operations (2) adi_open() used from adi_fops (3) drm_open_helper(): (3.1) accel_open() used from DRM_ACCEL_FOPS (3.2) drm_open() used from (3.2.1) amdgpu_driver_kms_fops (3.2.2) psb_gem_fops (3.2.3) i915_driver_fops (3.2.4) nouveau_driver_fops (3.2.5) panthor_drm_driver_fops (3.2.6) radeon_driver_kms_fops (3.2.7) tegra_drm_fops (3.2.8) vmwgfx_driver_fops (3.2.9) xe_driver_fops (3.2.10) DRM_GEM_FOPS (3.2.11) DEFINE_DRM_GEM_DMA_FOPS (4) struct memdev sets fmode flags based on type of device opened. For devices using struct mem_fops unsigned offset is used. Mark all these file operations as FOP_UNSIGNED_OFFSET and add asserts into the open helper to ensure that the flag is always set. Link: https://lore.kernel.org/r/20240809-work-fop_unsigned-v1-1-658e054d893e@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- drivers/char/adi.c | 8 +------- drivers/char/mem.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + drivers/gpu/drm/drm_file.c | 3 ++- drivers/gpu/drm/gma500/psb_drv.c | 1 + drivers/gpu/drm/i915/i915_driver.c | 1 + drivers/gpu/drm/nouveau/nouveau_drm.c | 1 + drivers/gpu/drm/radeon/radeon_drv.c | 1 + drivers/gpu/drm/tegra/drm.c | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 1 + drivers/gpu/drm/xe/xe_device.c | 1 + fs/proc/base.c | 10 ++++------ fs/read_write.c | 2 +- include/drm/drm_accel.h | 3 ++- include/drm/drm_gem.h | 3 ++- include/drm/drm_gem_dma_helper.h | 1 + include/linux/fs.h | 5 +++-- mm/mmap.c | 2 +- 18 files changed, 27 insertions(+), 21 deletions(-) diff --git a/drivers/char/adi.c b/drivers/char/adi.c index 751d7cc0da1b..1c76c8758f0f 100644 --- a/drivers/char/adi.c +++ b/drivers/char/adi.c @@ -14,12 +14,6 @@ #define MAX_BUF_SZ PAGE_SIZE -static int adi_open(struct inode *inode, struct file *file) -{ - file->f_mode |= FMODE_UNSIGNED_OFFSET; - return 0; -} - static int read_mcd_tag(unsigned long addr) { long err; @@ -206,9 +200,9 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence) static const struct file_operations adi_fops = { .owner = THIS_MODULE, .llseek = adi_llseek, - .open = adi_open, .read = adi_read, .write = adi_write, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static struct miscdevice adi_miscdev = { diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 7c359cc406d5..169eed162a7f 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -643,6 +643,7 @@ static const struct file_operations __maybe_unused mem_fops = { .get_unmapped_area = get_unmapped_area_mem, .mmap_capabilities = memory_mmap_capabilities, #endif + .fop_flags = FOP_UNSIGNED_OFFSET, }; static const struct file_operations null_fops = { @@ -693,7 +694,7 @@ static const struct memdev { umode_t mode; } devlist[] = { #ifdef CONFIG_DEVMEM - [DEVMEM_MINOR] = { "mem", &mem_fops, FMODE_UNSIGNED_OFFSET, 0 }, + [DEVMEM_MINOR] = { "mem", &mem_fops, 0, 0 }, #endif [3] = { "null", &null_fops, FMODE_NOWAIT, 0666 }, #ifdef CONFIG_DEVPORT diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 094498a0964b..d7ef8cbecf6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2908,6 +2908,7 @@ static const struct file_operations amdgpu_driver_kms_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = drm_show_fdinfo, #endif + .fop_flags = FOP_UNSIGNED_OFFSET, }; int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c index 714e42b05108..f8de3cba1a08 100644 --- a/drivers/gpu/drm/drm_file.c +++ b/drivers/gpu/drm/drm_file.c @@ -318,6 +318,8 @@ int drm_open_helper(struct file *filp, struct drm_minor *minor) if (dev->switch_power_state != DRM_SWITCH_POWER_ON && dev->switch_power_state != DRM_SWITCH_POWER_DYNAMIC_OFF) return -EINVAL; + if (WARN_ON_ONCE(!(filp->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; drm_dbg_core(dev, "comm=\"%s\", pid=%d, minor=%d\n", current->comm, task_pid_nr(current), minor->index); @@ -335,7 +337,6 @@ int drm_open_helper(struct file *filp, struct drm_minor *minor) } filp->private_data = priv; - filp->f_mode |= FMODE_UNSIGNED_OFFSET; priv->filp = filp; mutex_lock(&dev->filelist_mutex); diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c index 8b64f61ffaf9..d67c2b3ad901 100644 --- a/drivers/gpu/drm/gma500/psb_drv.c +++ b/drivers/gpu/drm/gma500/psb_drv.c @@ -498,6 +498,7 @@ static const struct file_operations psb_gem_fops = { .mmap = drm_gem_mmap, .poll = drm_poll, .read = drm_read, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static const struct drm_driver driver = { diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c index fb8e9c2fcea5..cf276299bccb 100644 --- a/drivers/gpu/drm/i915/i915_driver.c +++ b/drivers/gpu/drm/i915/i915_driver.c @@ -1671,6 +1671,7 @@ static const struct file_operations i915_driver_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = drm_show_fdinfo, #endif + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index a58c31089613..e243b42f8582 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -1274,6 +1274,7 @@ nouveau_driver_fops = { .compat_ioctl = nouveau_compat_ioctl, #endif .llseek = noop_llseek, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static struct drm_driver diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 7bf08164140e..ac49779ed03d 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -520,6 +520,7 @@ static const struct file_operations radeon_driver_kms_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = radeon_kms_compat_ioctl, #endif + .fop_flags = FOP_UNSIGNED_OFFSET, }; static const struct drm_ioctl_desc radeon_ioctls_kms[] = { diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 03d1c76aec2d..108c26a33edb 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -801,6 +801,7 @@ static const struct file_operations tegra_drm_fops = { .read = drm_read, .compat_ioctl = drm_compat_ioctl, .llseek = noop_llseek, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int tegra_drm_context_cleanup(int id, void *p, void *data) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c index 50ad3105c16e..2825dd3149ed 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c @@ -1609,6 +1609,7 @@ static const struct file_operations vmwgfx_driver_fops = { .compat_ioctl = vmw_compat_ioctl, #endif .llseek = noop_llseek, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static const struct drm_driver driver = { diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index f2f1d8ddb221..2e5537d6608d 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -238,6 +238,7 @@ static const struct file_operations xe_driver_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = drm_show_fdinfo, #endif + .fop_flags = FOP_UNSIGNED_OFFSET, }; static struct drm_driver driver = { diff --git a/fs/proc/base.c b/fs/proc/base.c index f389c69767fa..65818de11d7c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -862,12 +862,9 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; - - return ret; + if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; + return __mem_open(inode, file, PTRACE_MODE_ATTACH); } static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) @@ -991,6 +988,7 @@ static const struct file_operations proc_mem_operations = { .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) diff --git a/fs/read_write.c b/fs/read_write.c index 90e283b31ca1..89d4af0e3b93 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { - return file->f_mode & FMODE_UNSIGNED_OFFSET; + return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** diff --git a/include/drm/drm_accel.h b/include/drm/drm_accel.h index f4d3784b1dce..41c78b7d712c 100644 --- a/include/drm/drm_accel.h +++ b/include/drm/drm_accel.h @@ -28,7 +28,8 @@ .poll = drm_poll,\ .read = drm_read,\ .llseek = noop_llseek, \ - .mmap = drm_gem_mmap + .mmap = drm_gem_mmap, \ + .fop_flags = FOP_UNSIGNED_OFFSET /** * DEFINE_DRM_ACCEL_FOPS() - macro to generate file operations for accelerators drivers diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index bae4865b2101..d8b86df2ec0d 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -447,7 +447,8 @@ struct drm_gem_object { .poll = drm_poll,\ .read = drm_read,\ .llseek = noop_llseek,\ - .mmap = drm_gem_mmap + .mmap = drm_gem_mmap, \ + .fop_flags = FOP_UNSIGNED_OFFSET /** * DEFINE_DRM_GEM_FOPS() - macro to generate file operations for GEM drivers diff --git a/include/drm/drm_gem_dma_helper.h b/include/drm/drm_gem_dma_helper.h index a827bde494f6..f2678e7ecb98 100644 --- a/include/drm/drm_gem_dma_helper.h +++ b/include/drm/drm_gem_dma_helper.h @@ -267,6 +267,7 @@ unsigned long drm_gem_dma_get_unmapped_area(struct file *filp, .read = drm_read,\ .llseek = noop_llseek,\ .mmap = drm_gem_mmap,\ + .fop_flags = FOP_UNSIGNED_OFFSET, \ DRM_GEM_DMA_UNMAPPED_AREA_FOPS \ } diff --git a/include/linux/fs.h b/include/linux/fs.h index 696c620f90e0..0145bda465ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -146,8 +146,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* File is huge (eg. /dev/mem): treat loff_t as unsigned */ -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13)) +/* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) @@ -2073,6 +2072,8 @@ struct file_operations { #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) +/* Treat loff_t as unsigned (e.g., /dev/mem) */ +#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, diff --git a/mm/mmap.c b/mm/mmap.c index d0dfc85b209b..6ddb278a5ee8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1229,7 +1229,7 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ - if (file->f_mode & FMODE_UNSIGNED_OFFSET) + if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ From 6b7ab39332903e5e7bdf3056352caca854ab645b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 15 Jul 2024 09:13:24 +0200 Subject: [PATCH 122/168] vfs: use RCU in ilookup A soft lockup in ilookup was reported when stress-testing a 512-way system [1] (see [2] for full context) and it was verified that not taking the lock shifts issues back to mm. [1] https://lore.kernel.org/linux-mm/56865e57-c250-44da-9713-cf1404595bcc@amd.com/ [2] https://lore.kernel.org/linux-mm/d2841226-e27b-4d3d-a578-63587a3aa4f3@amd.com/ Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240715071324.265879-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 10c4619faeef..de0969d6009c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1570,9 +1570,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino, true); - spin_unlock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino, false); if (inode) { if (IS_ERR(inode)) From 84e0e03b308816a48c67f6da2168fcea6d49eda8 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 23 Aug 2024 00:31:06 +0100 Subject: [PATCH 123/168] Squashfs: Ensure all readahead pages have been used In the recent work to remove page->index, a sanity check that ensured all the readhead pages were covered by the Squashfs data block was removed [1]. To avoid any regression, this commit adds the sanity check back in an equivalent way. Namely the page actor will now return error if any pages are unused after completion. [1] https://lore.kernel.org/all/20240818235847.170468-3-phillip@squashfs.org.uk/ -- Signed-off-by: Phillip Lougher Link: https://lore.kernel.org/r/20240822233106.121522-1-phillip@squashfs.org.uk V3: last_page should be actor->last_page Signed-off-by: Christian Brauner --- fs/squashfs/file.c | 4 ++-- fs/squashfs/file_direct.c | 2 +- fs/squashfs/page_actor.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 5a3745e52025..21aaa96856c1 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -535,7 +535,7 @@ static int squashfs_readahead_fragment(struct page **page, last_page = squashfs_page_actor_free(actor); - if (copied == expected) { + if (copied == expected && !IS_ERR(last_page)) { /* Last page (if present) may have trailing bytes not filled */ bytes = copied % PAGE_SIZE; if (bytes && last_page) @@ -625,7 +625,7 @@ static void squashfs_readahead(struct readahead_control *ractl) last_page = squashfs_page_actor_free(actor); - if (res == expected) { + if (res == expected && !IS_ERR(last_page)) { int bytes; /* Last page (if present) may have trailing bytes not filled */ diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 646d4d421f99..22251743fadf 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -80,7 +80,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, if (res < 0) goto mark_errored; - if (res != expected) { + if (res != expected || IS_ERR(last_page)) { res = -EIO; goto mark_errored; } diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index c6d837f0e9ca..ffe25eb77c32 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -33,10 +33,11 @@ extern struct squashfs_page_actor *squashfs_page_actor_init_special( loff_t start_index); static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { - struct page *last_page = actor->last_page; + struct page *last_page = actor->next_page == actor->pages ? actor->last_page : ERR_PTR(-EIO); kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) From e8201b314c01e44bade9c3f8b60c1d8a71770c7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 22 Aug 2024 15:50:09 +0200 Subject: [PATCH 124/168] fs: Allow fine-grained control of folio sizes We need filesystems to be able to communicate acceptable folio sizes to the pagecache for a variety of uses (e.g. large block sizes). Support a range of folio sizes between order-0 and order-31. Signed-off-by: Matthew Wilcox (Oracle) Co-developed-by: Pankaj Raghav Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 90 ++++++++++++++++++++++++++++++++++------- mm/filemap.c | 6 +-- mm/readahead.c | 4 +- 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9c7edb6422b..c60025bb584c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -204,14 +204,21 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, - AS_LARGE_FOLIO_SUPPORT = 6, - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ - AS_STABLE_WRITES, /* must wait for writeback before modifying + AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ - AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, - including to move the mapping */ + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + /* Bits 16-25 are used for FOLIO_ORDER */ + AS_FOLIO_ORDER_BITS = 5, + AS_FOLIO_ORDER_MIN = 16, + AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, }; +#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) +#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) +#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) +#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) + /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set @@ -367,9 +374,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) +/* + * mapping_set_folio_order_range() - Set the orders supported by a file. + * @mapping: The address space of the file. + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). + * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). + * + * The filesystem should call this function in its inode constructor to + * indicate which base size (min) and maximum size (max) of folio the VFS + * can use to cache the contents of the file. This should only be used + * if the filesystem needs special handling of folio sizes (ie there is + * something the core cannot know). + * Do not tune it based on, eg, i_size. + * + * Context: This should not be called while the inode is active as it + * is non-atomic. + */ +static inline void mapping_set_folio_order_range(struct address_space *mapping, + unsigned int min, + unsigned int max) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + if (min > MAX_PAGECACHE_ORDER) + min = MAX_PAGECACHE_ORDER; + + if (max > MAX_PAGECACHE_ORDER) + max = MAX_PAGECACHE_ORDER; + + if (max < min) + max = min; + + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | + (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); +} + +static inline void mapping_set_folio_min_order(struct address_space *mapping, + unsigned int min) +{ + mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); +} + /** * mapping_set_large_folios() - Indicate the file supports large folios. - * @mapping: The file. + * @mapping: The address space of the file. * * The filesystem should call this function in its inode constructor to * indicate that the VFS can use large folios to cache the contents of @@ -380,7 +429,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) */ static inline void mapping_set_large_folios(struct address_space *mapping) { - __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); +} + +static inline unsigned int +mapping_max_folio_order(const struct address_space *mapping) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 0; + return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; +} + +static inline unsigned int +mapping_min_folio_order(const struct address_space *mapping) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 0; + return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } /* @@ -389,20 +454,17 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { - /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, "Anonymous mapping always supports large folio"); - return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + return mapping_max_folio_order(mapping) > 0; } /* Return the maximum folio size for this pagecache mapping, in bytes. */ -static inline size_t mapping_max_folio_size(struct address_space *mapping) +static inline size_t mapping_max_folio_size(const struct address_space *mapping) { - if (mapping_large_folio_support(mapping)) - return PAGE_SIZE << MAX_PAGECACHE_ORDER; - return PAGE_SIZE; + return PAGE_SIZE << mapping_max_folio_order(mapping); } static inline int filemap_nr_thps(struct address_space *mapping) diff --git a/mm/filemap.c b/mm/filemap.c index d62150418b91..ad5e4a848070 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1933,10 +1933,8 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; - if (!mapping_large_folio_support(mapping)) - order = 0; - if (order > MAX_PAGECACHE_ORDER) - order = MAX_PAGECACHE_ORDER; + if (order > mapping_max_folio_order(mapping)) + order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); diff --git a/mm/readahead.c b/mm/readahead.c index 517c0be7ce66..3e5239e9e177 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -449,10 +449,10 @@ void page_cache_ra_order(struct readahead_control *ractl, limit = min(limit, index + ra->size - 1); - if (new_order < MAX_PAGECACHE_ORDER) + if (new_order < mapping_max_folio_order(mapping)) new_order += 2; - new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); + new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); /* See comment in page_cache_ra_unbounded() */ From c104d25f8c4955e1848eda117a5d276b83409f33 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:10 +0200 Subject: [PATCH 125/168] filemap: allocate mapping_min_order folios in the page cache filemap_create_folio() and do_read_cache_folio() were always allocating folio of order 0. __filemap_get_folio was trying to allocate higher order folios when fgp_flags had higher order hint set but it will default to order 0 folio if higher order memory allocation fails. Supporting mapping_min_order implies that we guarantee each folio in the page cache has at least an order of mapping_min_order. When adding new folios to the page cache we must also ensure the index used is aligned to the mapping_min_order as the page cache requires the index to be aligned to the order of the folio. Co-developed-by: Luis Chamberlain Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-3-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 20 ++++++++++++++++++++ mm/filemap.c | 24 ++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c60025bb584c..4cc170949e9c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -448,6 +448,26 @@ mapping_min_folio_order(const struct address_space *mapping) return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } +static inline unsigned long +mapping_min_folio_nrpages(struct address_space *mapping) +{ + return 1UL << mapping_min_folio_order(mapping); +} + +/** + * mapping_align_index() - Align index for this mapping. + * @mapping: The address_space. + * + * The index of a folio must be naturally aligned. If you are adding a + * new folio to the page cache and need to know what index to give it, + * call this function. + */ +static inline pgoff_t mapping_align_index(struct address_space *mapping, + pgoff_t index) +{ + return round_down(index, mapping_min_folio_nrpages(mapping)); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. diff --git a/mm/filemap.c b/mm/filemap.c index ad5e4a848070..d27e9ac54309 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -859,6 +859,8 @@ noinline int __filemap_add_folio(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); + VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), + folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); @@ -1919,8 +1921,10 @@ repeat: folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { - unsigned order = FGF_GET_ORDER(fgp_flags); + unsigned int min_order = mapping_min_folio_order(mapping); + unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; + index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; @@ -1943,7 +1947,7 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order > 0) + if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) @@ -1958,7 +1962,7 @@ no_page: break; folio_put(folio); folio = NULL; - } while (order-- > 0); + } while (order-- > min_order); if (err == -EEXIST) goto repeat; @@ -2447,13 +2451,15 @@ unlock_mapping: } static int filemap_create_folio(struct file *file, - struct address_space *mapping, pgoff_t index, + struct address_space *mapping, loff_t pos, struct folio_batch *fbatch) { struct folio *folio; int error; + unsigned int min_order = mapping_min_folio_order(mapping); + pgoff_t index; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; @@ -2471,6 +2477,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); + index = (pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2531,8 +2538,7 @@ retry: if (!folio_batch_count(fbatch)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - err = filemap_create_folio(filp, mapping, - iocb->ki_pos >> PAGE_SHIFT, fbatch); + err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; @@ -3748,9 +3754,11 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, 0); + folio = filemap_alloc_folio(gfp, + mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); + index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); From 7949d4e70aefd7033fbf824837b841a5d2ebe129 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:11 +0200 Subject: [PATCH 126/168] readahead: allocate folios with mapping_min_order in readahead page_cache_ra_unbounded() was allocating single pages (0 order folios) if there was no folio found in an index. Allocate mapping_min_order folios as we need to guarantee the minimum order if it is set. page_cache_ra_order() tries to allocate folio to the page cache with a higher order if the index aligns with that order. Modify it so that the order does not go below the mapping_min_order requirement of the page cache. This function will do the right thing even if the new_order passed is less than the mapping_min_order. When adding new folios to the page cache we must also ensure the index used is aligned to the mapping_min_order as the page cache requires the index to be aligned to the order of the folio. readahead_expand() is called from readahead aops to extend the range of the readahead so this function can assume ractl->_index to be aligned with min_order. Signed-off-by: Pankaj Raghav Co-developed-by: Hannes Reinecke Signed-off-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240822135018.1931258-4-kernel@pankajraghav.com Tested-by: David Howells Acked-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- mm/readahead.c | 79 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 3e5239e9e177..2078c42777a6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -206,9 +206,10 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; - unsigned long index = readahead_index(ractl); + unsigned long ra_folio_index, index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); - unsigned long i; + unsigned long mark, i = 0; + unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added @@ -223,10 +224,24 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); + index = mapping_align_index(mapping, index); + + /* + * As iterator `i` is aligned to min_nrpages, round_up the + * difference between nr_to_read and lookahead_size to mark the + * index that only has lookahead or "async_region" to set the + * readahead flag. + */ + ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, + min_nrpages); + mark = ra_folio_index - index; + nr_to_read += readahead_index(ractl) - index; + ractl->_index = index; + /* * Preallocate as many pages as we will need. */ - for (i = 0; i < nr_to_read; i++) { + while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; @@ -240,12 +255,13 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * not worth getting one just for that. */ read_pages(ractl); - ractl->_index++; - i = ractl->_index + ractl->_nr_pages - index - 1; + ractl->_index += min_nrpages; + i = ractl->_index + ractl->_nr_pages - index; continue; } - folio = filemap_alloc_folio(gfp_mask, 0); + folio = filemap_alloc_folio(gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -255,14 +271,15 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, if (ret == -ENOMEM) break; read_pages(ractl); - ractl->_index++; - i = ractl->_index + ractl->_nr_pages - index - 1; + ractl->_index += min_nrpages; + i = ractl->_index + ractl->_nr_pages - index; continue; } - if (i == nr_to_read - lookahead_size) + if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); - ractl->_nr_pages++; + ractl->_nr_pages += min_nrpages; + i += min_nrpages; } /* @@ -438,13 +455,19 @@ void page_cache_ra_order(struct readahead_control *ractl, struct address_space *mapping = ractl->mapping; pgoff_t start = readahead_index(ractl); pgoff_t index = start; + unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); + unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); - if (!mapping_large_folio_support(mapping) || ra->size < 4) + /* + * Fallback when size < min_nrpages as each folio should be + * at least min_nrpages anyway. + */ + if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) goto fallback; limit = min(limit, index + ra->size - 1); @@ -454,10 +477,19 @@ void page_cache_ra_order(struct readahead_control *ractl, new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); + new_order = max(new_order, min_order); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); + /* + * If the new_order is greater than min_order and index is + * already aligned to new_order, then this will be noop as index + * aligned to new_order should also be aligned to min_order. + */ + ractl->_index = mapping_align_index(mapping, index); + index = readahead_index(ractl); + while (index <= limit) { unsigned int order = new_order; @@ -465,7 +497,7 @@ void page_cache_ra_order(struct readahead_control *ractl, if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ - while (index + (1UL << order) - 1 > limit) + while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) @@ -703,8 +735,15 @@ void readahead_expand(struct readahead_control *ractl, struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); + unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); + unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; + /* + * Readahead code should have aligned the ractl->_index to + * min_nrpages before calling readahead aops. + */ + VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { @@ -714,9 +753,11 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, 0); + folio = filemap_alloc_folio(gfp_mask, min_order); if (!folio) return; + + index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; @@ -726,7 +767,7 @@ void readahead_expand(struct readahead_control *ractl, ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } - ractl->_nr_pages++; + ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } @@ -741,9 +782,11 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, 0); + folio = filemap_alloc_folio(gfp_mask, min_order); if (!folio) return; + + index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; @@ -753,10 +796,10 @@ void readahead_expand(struct readahead_control *ractl, ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } - ractl->_nr_pages++; + ractl->_nr_pages += min_nrpages; if (ra) { - ra->size++; - ra->async_size++; + ra->size += min_nrpages; + ra->async_size += min_nrpages; } } } From fd031210c9ceb399db1dea001c6a5e98f3b4e2e7 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 22 Aug 2024 15:50:12 +0200 Subject: [PATCH 127/168] mm: split a folio in minimum folio order chunks split_folio() and split_folio_to_list() assume order 0, to support minorder for non-anonymous folios, we must expand these to check the folio mapping order and use that. Set new_order to be at least minimum folio order if it is set in split_huge_page_to_list() so that we can maintain minimum folio order requirement in the page cache. Update the debugfs write files used for testing to ensure the order is respected as well. We simply enforce the min order when a file mapping is used. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-5-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Zi Yan Signed-off-by: Christian Brauner --- include/linux/huge_mm.h | 14 +++++++--- mm/huge_memory.c | 60 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e25d9ebfdf89..7c50aeed0522 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,6 +96,8 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define split_folio(f) split_folio_to_list(f, NULL) + #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PUD_SHIFT PUD_SHIFT @@ -317,9 +319,10 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); +int split_folio_to_list(struct folio *folio, struct list_head *list); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list_to_order(page, NULL, 0); + return split_folio(page_folio(page)); } void deferred_split_folio(struct folio *folio); @@ -484,6 +487,12 @@ static inline int split_huge_page(struct page *page) { return 0; } + +static inline int split_folio_to_list(struct folio *folio, struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -598,7 +607,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } -#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) -#define split_folio(f) split_folio_to_order(f, 0) - #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4be468e06a4..c29af9451d92 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3082,6 +3082,9 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) * released, or if some unexpected race happened (e.g., anon VMA disappeared, * truncation). * + * Callers should ensure that the order respects the address space mapping + * min-order if one is set for non-anonymous folios. + * * Returns -EINVAL when trying to split to an order that is incompatible * with the folio. Splitting to order 0 is compatible with all folios. */ @@ -3163,6 +3166,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, mapping = NULL; anon_vma_lock_write(anon_vma); } else { + unsigned int min_order; gfp_t gfp; mapping = folio->mapping; @@ -3173,6 +3177,14 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, goto out; } + min_order = mapping_min_folio_order(folio->mapping); + if (new_order < min_order) { + VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", + min_order); + ret = -EINVAL; + goto out; + } + gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); @@ -3285,6 +3297,25 @@ out: return ret; } +int split_folio_to_list(struct folio *folio, struct list_head *list) +{ + unsigned int min_order = 0; + + if (folio_test_anon(folio)) + goto out; + + if (!folio->mapping) { + if (folio_test_pmd_mappable(folio)) + count_vm_event(THP_SPLIT_PAGE_FAILED); + return -EBUSY; + } + + min_order = mapping_min_folio_order(folio->mapping); +out: + return split_huge_page_to_list_to_order(&folio->page, list, + min_order); +} + void __folio_undo_large_rmappable(struct folio *folio) { struct deferred_split *ds_queue; @@ -3515,6 +3546,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, struct vm_area_struct *vma = vma_lookup(mm, addr); struct page *page; struct folio *folio; + struct address_space *mapping; + unsigned int target_order = new_order; if (!vma) break; @@ -3535,7 +3568,13 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!is_transparent_hugepage(folio)) goto next; - if (new_order >= folio_order(folio)) + if (!folio_test_anon(folio)) { + mapping = folio->mapping; + target_order = max(new_order, + mapping_min_folio_order(mapping)); + } + + if (target_order >= folio_order(folio)) goto next; total++; @@ -3551,9 +3590,14 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (!folio_trylock(folio)) goto next; - if (!split_folio_to_order(folio, new_order)) + if (!folio_test_anon(folio) && folio->mapping != mapping) + goto unlock; + + if (!split_folio_to_order(folio, target_order)) split++; +unlock: + folio_unlock(folio); next: folio_put(folio); @@ -3578,6 +3622,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, pgoff_t index; int nr_pages = 1; unsigned long total = 0, split = 0; + unsigned int min_order; + unsigned int target_order; file = getname_kernel(file_path); if (IS_ERR(file)) @@ -3591,6 +3637,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, file_path, off_start, off_end); mapping = candidate->f_mapping; + min_order = mapping_min_folio_order(mapping); + target_order = max(new_order, min_order); for (index = off_start; index < off_end; index += nr_pages) { struct folio *folio = filemap_get_folio(mapping, index); @@ -3605,15 +3653,19 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, total++; nr_pages = folio_nr_pages(folio); - if (new_order >= folio_order(folio)) + if (target_order >= folio_order(folio)) goto next; if (!folio_trylock(folio)) goto next; - if (!split_folio_to_order(folio, new_order)) + if (folio->mapping != mapping) + goto unlock; + + if (!split_folio_to_order(folio, target_order)) split++; +unlock: folio_unlock(folio); next: folio_put(folio); From e9f3b433acd0558a06e287f840b41add1d55df89 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:13 +0200 Subject: [PATCH 128/168] filemap: cap PTE range to be created to allowed zero fill in folio_map_range() Usually the page cache does not extend beyond the size of the inode, therefore, no PTEs are created for folios that extend beyond the size. But with LBS support, we might extend page cache beyond the size of the inode as we need to guarantee folios of minimum order. While doing a read, do_fault_around() can create PTEs for pages that lie beyond the EOF leading to incorrect error return when accessing a page beyond the mapped file. Cap the PTE range to be created for the page cache up to the end of file(EOF) in filemap_map_pages() so that return error codes are consistent with POSIX[1] for LBS configurations. generic/749 has been created to trigger this edge case. This also fixes generic/749 for tmpfs with huge=always on systems with 4k base page size. [1](from mmap(2)) SIGBUS Attempted access to a page of the buffer that lies beyond the end of the mapped file. For an explanation of the treatment of the bytes in the page that corresponds to the end of a mapped file that is not a multiple of the page size, see NOTES. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-6-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- mm/filemap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index d27e9ac54309..d32210927453 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3608,7 +3608,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - pgoff_t last_pgoff = start_pgoff; + pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; @@ -3634,6 +3634,10 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; + if (end_pgoff > file_end) + end_pgoff = file_end; + folio_type = mm_counter_file(folio); do { unsigned long end; From d940b3b7b76b409b0550fdf2de6dc2183f01526f Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:14 +0200 Subject: [PATCH 129/168] iomap: fix iomap_dio_zero() for fs bs > system page size iomap_dio_zero() will pad a fs block with zeroes if the direct IO size < fs block size. iomap_dio_zero() has an implicit assumption that fs block size < page_size. This is true for most filesystems at the moment. If the block size > page size, this will send the contents of the page next to zero page(as len > PAGE_SIZE) to the underlying block device, causing FS corruption. iomap is a generic infrastructure and it should not make any assumptions about the fs block size and the page size of the system. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-7-kernel@pankajraghav.com Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 4 ++-- fs/iomap/direct-io.c | 45 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index f420c53d86ac..d745f718bcde 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -2007,10 +2007,10 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, } EXPORT_SYMBOL_GPL(iomap_writepages); -static int __init iomap_init(void) +static int __init iomap_buffered_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } -fs_initcall(iomap_init); +fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index f3b43d223a46..c02b266bba52 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "trace.h" @@ -27,6 +28,13 @@ #define IOMAP_DIO_WRITE (1U << 30) #define IOMAP_DIO_DIRTY (1U << 31) +/* + * Used for sub block zeroing in iomap_dio_zero() + */ +#define IOMAP_ZERO_PAGE_SIZE (SZ_64K) +#define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) +static struct page *zero_page; + struct iomap_dio { struct kiocb *iocb; const struct iomap_dio_ops *dops; @@ -232,13 +240,20 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); -static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, +static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { struct inode *inode = file_inode(dio->iocb->ki_filp); - struct page *page = ZERO_PAGE(0); struct bio *bio; + if (!len) + return 0; + /* + * Max block size supported is 64k + */ + if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) + return -EINVAL; + bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); @@ -246,8 +261,9 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - __bio_add_page(bio, page, len, 0); + __bio_add_page(bio, zero_page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); + return 0; } /* @@ -356,8 +372,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); - if (pad) - iomap_dio_zero(iter, dio, pos - pad, pad); + + ret = iomap_dio_zero(iter, dio, pos - pad, pad); + if (ret) + goto out; } /* @@ -431,7 +449,8 @@ zero_tail: /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); + ret = iomap_dio_zero(iter, dio, pos, + fs_block_size - pad); } out: /* Undo iter limitation to current extent */ @@ -753,3 +772,17 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, return iomap_dio_complete(dio); } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +static int __init iomap_dio_init(void) +{ + zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + IOMAP_ZERO_PAGE_ORDER); + + if (!zero_page) + return -ENOMEM; + + set_memory_ro((unsigned long)page_address(zero_page), + 1U << IOMAP_ZERO_PAGE_ORDER); + return 0; +} +fs_initcall(iomap_dio_init); From 13c9f3c68405d90300ec3eb341d1378e831dd565 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Aug 2024 15:50:15 +0200 Subject: [PATCH 130/168] xfs: use kvmalloc for xattr buffers Pankaj Raghav reported that when filesystem block size is larger than page size, the xattr code can use kmalloc() for high order allocations. This triggers a useless warning in the allocator as it is a __GFP_NOFAIL allocation here: static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { struct page *page; /* * We most definitely don't want callers attempting to * allocate greater than order-1 page units with __GFP_NOFAIL. */ >>>> WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); ... Fix this by changing all these call sites to use kvmalloc(), which will strip the NOFAIL from the kmalloc attempt and if that fails will do a __GFP_NOFAIL vmalloc(). This is not an issue that productions systems will see as filesystems with block size > page size cannot be mounted by the kernel; Pankaj is developing this functionality right now. Reported-by: Pankaj Raghav Fixes: f078d4ea8276 ("xfs: convert kmem_alloc() to kmalloc()") Signed-off-by: Dave Chinner Link: https://lore.kernel.org/r/20240822135018.1931258-8-kernel@pankajraghav.com Reviewed-by: Darrick J. Wong Reviewed-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/xfs/libxfs/xfs_attr_leaf.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b9e98950eb3d..09f4cb061a6e 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1138,10 +1138,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); - if (!tmpbuffer) - return -ENOMEM; - + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1205,7 +1202,7 @@ xfs_attr3_leaf_to_shortform( error = 0; out: - kfree(tmpbuffer); + kvfree(tmpbuffer); return error; } @@ -1613,7 +1610,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; @@ -1651,7 +1648,7 @@ xfs_attr3_leaf_compact( */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); - kfree(tmpbuffer); + kvfree(tmpbuffer); } /* @@ -2330,7 +2327,7 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kzalloc(state->args->geo->blksize, + tmp_leaf = kvzalloc(state->args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); /* @@ -2371,7 +2368,7 @@ xfs_attr3_leaf_unbalance( } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ - kfree(tmp_leaf); + kvfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); From 4e70eedd93aed67b6190aac3439dba9343acff41 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:16 +0200 Subject: [PATCH 131/168] xfs: expose block size in stat For block size larger than page size, the unit of efficient IO is the block size, not the page size. Leaving stat() to report PAGE_SIZE as the block size causes test programs like fsx to issue illegal ranges for operations that require block size alignment (e.g. fallocate() insert range). Hence update the preferred IO size to reflect the block size in this case. This change is based on a patch originally from Dave Chinner.[1] [1] https://lwn.net/ml/linux-fsdevel/20181107063127.3902-16-david@fromorbit.com/ Signed-off-by: Pankaj Raghav Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240822135018.1931258-9-kernel@pankajraghav.com Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Hannes Reinecke Signed-off-by: Christian Brauner --- fs/xfs/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1cdc8034f54d..6483b4c4cf35 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -567,7 +567,7 @@ xfs_stat_blksize( return 1U << mp->m_allocsize_log; } - return PAGE_SIZE; + return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); } STATIC int From f8b794f5072513061068b17e6bab5c64ab6c9fae Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:17 +0200 Subject: [PATCH 132/168] xfs: make the calculation generic in xfs_sb_validate_fsb_count() Instead of assuming that PAGE_SHIFT is always higher than the blocklog, make the calculation generic so that page cache count can be calculated correctly for LBS. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-10-kernel@pankajraghav.com Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- fs/xfs/xfs_mount.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 09eef1721ef4..3949f720b535 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -132,11 +132,16 @@ xfs_sb_validate_fsb_count( xfs_sb_t *sbp, uint64_t nblocks) { + uint64_t max_bytes; + ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); + if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes)) + return -EFBIG; + /* Limited by ULONG_MAX of page cache index */ - if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + if (max_bytes >> PAGE_SHIFT > ULONG_MAX) return -EFBIG; return 0; } From 0ab3ca31b0124a5ae0574542f547a7d61b00098d Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:18 +0200 Subject: [PATCH 133/168] xfs: enable block size larger than page size support Page cache now has the ability to have a minimum order when allocating a folio which is a prerequisite to add support for block size > page size. Signed-off-by: Pankaj Raghav Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240822135018.1931258-11-kernel@pankajraghav.com Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- fs/xfs/libxfs/xfs_ialloc.c | 5 +++++ fs/xfs/libxfs/xfs_shared.h | 3 +++ fs/xfs/xfs_icache.c | 6 ++++-- fs/xfs/xfs_mount.c | 1 - fs/xfs/xfs_super.c | 28 ++++++++++++++++++++-------- include/linux/pagemap.h | 13 +++++++++++++ 6 files changed, 45 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0af5b7a33d05..1921b689888b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -3033,6 +3033,11 @@ xfs_ialloc_setup_geometry( igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; + + if (mp->m_sb.sb_blocksize > PAGE_SIZE) + igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; + else + igeo->min_folio_order = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 2f7413afbf46..33b84a3a83ff 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -224,6 +224,9 @@ struct xfs_ino_geometry { /* precomputed value for di_flags2 */ uint64_t new_diflags2; + /* minimum folio order of a page cache allocation */ + unsigned int min_folio_order; + }; #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index cf629302d48e..0fcf235e5023 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -88,7 +88,8 @@ xfs_inode_alloc( /* VFS doesn't initialise i_mode! */ VFS_I(ip)->i_mode = 0; - mapping_set_large_folios(VFS_I(ip)->i_mapping); + mapping_set_folio_min_order(VFS_I(ip)->i_mapping, + M_IGEO(mp)->min_folio_order); XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -325,7 +326,8 @@ xfs_reinit_inode( inode->i_uid = uid; inode->i_gid = gid; inode->i_state = state; - mapping_set_large_folios(inode->i_mapping); + mapping_set_folio_min_order(inode->i_mapping, + M_IGEO(mp)->min_folio_order); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3949f720b535..c6933440f806 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -134,7 +134,6 @@ xfs_sb_validate_fsb_count( { uint64_t max_bytes; - ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 27e9f749c4c7..242271298a33 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1638,16 +1638,28 @@ xfs_fs_fill_super( goto out_free_sb; } - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ if (mp->m_sb.sb_blocksize > PAGE_SIZE) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", + size_t max_folio_size = mapping_max_folio_size_supported(); + + if (!xfs_has_crc(mp)) { + xfs_warn(mp, +"V4 Filesystem with blocksize %d bytes. Only pagesize (%ld) or less is supported.", mp->m_sb.sb_blocksize, PAGE_SIZE); - error = -ENOSYS; - goto out_free_sb; + error = -ENOSYS; + goto out_free_sb; + } + + if (mp->m_sb.sb_blocksize > max_folio_size) { + xfs_warn(mp, +"block size (%u bytes) not supported; Only block size (%ld) or less is supported", + mp->m_sb.sb_blocksize, max_folio_size); + error = -ENOSYS; + goto out_free_sb; + } + + xfs_warn(mp, +"EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.", + mp->m_sb.sb_blocksize); } /* Ensure this filesystem fits in the page cache limits */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4cc170949e9c..55b254d951da 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -374,6 +374,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) +/* + * mapping_max_folio_size_supported() - Check the max folio size supported + * + * The filesystem should call this function at mount time if there is a + * requirement on the folio mapping size in the page cache. + */ +static inline size_t mapping_max_folio_size_supported(void) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); + return PAGE_SIZE; +} + /* * mapping_set_folio_order_range() - Set the orders supported by a file. * @mapping: The address space of the file. From 51eac77be01be978bfd9aeed40981a5094beb1f1 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 26 Aug 2024 14:26:32 -0700 Subject: [PATCH 134/168] iomap: remove set_memor_ro() on zero page Stephen reported a boot failure on ppc power8 system where set_memor_ro() on the new zero page failed [0]. Christophe Leroy further clarifies we can't use this on on linear memory on ppc, and so instead of special casing this just for PowerPC [2] remove the call as suggested by Darrick. [0] https://lore.kernel.org/all/20240826175931.1989f99e@canb.auug.org.au/T/#u [1] https://lore.kernel.org/all/b0fe75b4-c1bb-47f7-a7c3-2534b31c1780@csgroup.eu/ [2] https://lore.kernel.org/all/ZszrJkFOpiy5rCma@bombadil.infradead.org/ Reported-by: Stephen Rothwell Suggested-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240826212632.2098685-1-mcgrof@kernel.org Tested-by: Stephen Rothwell Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c02b266bba52..f637aa0706a3 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include "trace.h" @@ -781,8 +780,6 @@ static int __init iomap_dio_init(void) if (!zero_page) return -ENOMEM; - set_memory_ro((unsigned long)page_address(zero_page), - 1U << IOMAP_ZERO_PAGE_ORDER); return 0; } fs_initcall(iomap_dio_init); From f143d1a48d6ecce12f5bced0d18a10a0294726b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 27 Aug 2024 06:51:36 -0400 Subject: [PATCH 135/168] iomap: add a private argument for iomap_file_buffered_write In order to switch fuse over to using iomap for buffered writes we need to be able to have the struct file for the original write, in case we have to read in the page to make it uptodate. Handle this by using the existing private field in the iomap_iter, and add the argument to iomap_file_buffered_write. This will allow us to pass the file in through the iomap buffered write path, and is flexible for any other file systems needs. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/7f55c7c32275004ba00cddf862d970e6e633f750.1724755651.git.josef@toxicpanda.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- block/fops.c | 2 +- fs/gfs2/file.c | 2 +- fs/iomap/buffered-io.c | 3 ++- fs/xfs/xfs_file.c | 2 +- fs/zonefs/file.c | 2 +- include/linux/iomap.h | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/block/fops.c b/block/fops.c index 9825c1713a49..d16a6dddb12a 100644 --- a/block/fops.c +++ b/block/fops.c @@ -665,7 +665,7 @@ blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { - return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); } /* diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 08982937b5df..f7dd64856c9b 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1057,7 +1057,7 @@ retry: } pagefault_disable(); - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL); pagefault_enable(); if (ret > 0) written += ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d745f718bcde..e79d11701553 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1022,13 +1022,14 @@ retry: ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = iocb->ki_filp->f_mapping->host, .pos = iocb->ki_pos, .len = iov_iter_count(i), .flags = IOMAP_WRITE, + .private = private, }; ssize_t ret; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4cdc54dc9686..e9c693bb20bc 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -760,7 +760,7 @@ write_retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); /* * If we hit a space limit, try to free up some lingering preallocated diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 3b103715acc9..35166c92420c 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -563,7 +563,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); if (ret == -EIO) zonefs_io_error(inode, true); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fc1c858013d..f792b37f7627 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -257,7 +257,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, int (*punch)(struct inode *inode, loff_t pos, loff_t length)); From 1934b212615dc617ac84fc306333ab2b9fc3b04f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Aug 2024 18:00:01 +0200 Subject: [PATCH 136/168] file: reclaim 24 bytes from f_owner We do embedd struct fown_struct into struct file letting it take up 32 bytes in total. We could tweak struct fown_struct to be more compact but really it shouldn't even be embedded in struct file in the first place. Instead, actual users of struct fown_struct should allocate the struct on demand. This frees up 24 bytes in struct file. That will have some potentially user-visible changes for the ownership fcntl()s. Some of them can now fail due to allocation failures. Practically, that probably will almost never happen as the allocations are small and they only happen once per file. The fown_struct is used during kill_fasync() which is used by e.g., pipes to generate a SIGIO signal. Sending of such signals is conditional on userspace having set an owner for the file using one of the F_OWNER fcntl()s. Such users will be unaffected if struct fown_struct is allocated during the fcntl() call. There are a few subsystems that call __f_setown() expecting file->f_owner to be allocated: (1) tun devices file->f_op->fasync::tun_chr_fasync() -> __f_setown() There are no callers of tun_chr_fasync(). (2) tty devices file->f_op->fasync::tty_fasync() -> __tty_fasync() -> __f_setown() tty_fasync() has no additional callers but __tty_fasync() has. Note that __tty_fasync() only calls __f_setown() if the @on argument is true. It's called from: file->f_op->release::tty_release() -> tty_release() -> __tty_fasync() -> __f_setown() tty_release() calls __tty_fasync() with @on false => __f_setown() is never called from tty_release(). => All callers of tty_release() are safe as well. file->f_op->release::tty_open() -> tty_release() -> __tty_fasync() -> __f_setown() __tty_hangup() calls __tty_fasync() with @on false => __f_setown() is never called from tty_release(). => All callers of __tty_hangup() are safe as well. From the callchains it's obvious that (1) and (2) end up getting called via file->f_op->fasync(). That can happen either through the F_SETFL fcntl() with the FASYNC flag raised or via the FIOASYNC ioctl(). If FASYNC is requested and the file isn't already FASYNC then file->f_op->fasync() is called with @on true which ends up causing both (1) and (2) to call __f_setown(). (1) and (2) are the only subsystems that call __f_setown() from the file->f_op->fasync() handler. So both (1) and (2) have been updated to allocate a struct fown_struct prior to calling fasync_helper() to register with the fasync infrastructure. That's safe as they both call fasync_helper() which also does allocations if @on is true. The other interesting case are file leases: (3) file leases lease_manager_ops->lm_setup::lease_setup() -> __f_setown() Which in turn is called from: generic_add_lease() -> lease_manager_ops->lm_setup::lease_setup() -> __f_setown() So here again we can simply make generic_add_lease() allocate struct fown_struct prior to the lease_manager_ops->lm_setup::lease_setup() which happens under a spinlock. With that the two remaining subsystems that call __f_setown() are: (4) dnotify (5) sockets Both have their own custom ioctls to set struct fown_struct and both have been converted to allocate a struct fown_struct on demand from their respective ioctls. Interactions with O_PATH are fine as well e.g., when opening a /dev/tty as O_PATH then no file->f_op->open() happens thus no file->f_owner is allocated. That's fine as no file operation will be set for those and the device has never been opened. fcntl()s called on such things will just allocate a ->f_owner on demand. Although I have zero idea why'd you care about f_owner on an O_PATH fd. Link: https://lore.kernel.org/r/20240813-work-f_owner-v2-1-4e9343a79f9f@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/net/tun.c | 6 ++ drivers/tty/tty_io.c | 6 ++ fs/fcntl.c | 166 ++++++++++++++++++++++++++++-------- fs/file_table.c | 3 +- fs/internal.h | 1 + fs/locks.c | 6 +- fs/notify/dnotify/dnotify.c | 6 +- include/linux/fs.h | 11 ++- net/core/sock.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 11 files changed, 168 insertions(+), 43 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1d06c560c5e6..6fe5e8f7017c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3451,6 +3451,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on) struct tun_file *tfile = file->private_data; int ret; + if (on) { + ret = file_f_owner_allocate(file); + if (ret) + goto out; + } + if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c1..7ae0c8934f42 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on) if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; + if (on) { + retval = file_f_owner_allocate(filp); + if (retval) + goto out; + } + retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; diff --git a/fs/fcntl.c b/fs/fcntl.c index 300e5d9ad913..0587a0e221a6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include #include +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } +/* + * Allocate an file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. */ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); + } +} + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irq(&f_owner->lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -152,16 +202,21 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; - read_lock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; + + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +315,18 @@ static int f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -343,6 +414,30 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -421,15 +516,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(argi)) { - break; - } - err = 0; - filp->f_owner.signum = argi; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); @@ -844,14 +934,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -1027,13 +1122,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. */ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } diff --git a/fs/file_table.c b/fs/file_table.c index ca7843dde56d..fdf98709dde2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); f->f_flags = flags; @@ -425,7 +424,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); } fops_put(file->f_op); - put_pid(file->f_owner.pid); + file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..8c1b7acbbe8f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } +void file_f_owner_release(struct file *file); diff --git a/fs/locks.c b/fs/locks.c index e45cad40f8b6..b51b1c395ce6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) struct file *filp = fl->c.flc_file; f_delown(filp); - filp->f_owner.signum = 0; + file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); @@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + error = file_f_owner_allocate(filp); + if (error) + return error; + /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index f3669403fabf..46440fbb8662 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, prev = &dn->dn_next; continue; } - fown = &dn->dn_filp->f_owner; + fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; @@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } + error = file_f_owner_allocate(filp); + if (error) + goto out_err; + /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..7af239ca87e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -947,6 +947,7 @@ static inline unsigned imajor(const struct inode *inode) } struct fown_struct { + struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ @@ -1011,7 +1012,7 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; - struct fown_struct f_owner; + struct fown_struct *f_owner; const struct cred *f_cred; struct file_ra_state f_ra; struct path f_path; @@ -1076,6 +1077,12 @@ struct file_lease; #define OFFT_OFFSET_MAX type_max(off_t) #endif +int file_f_owner_allocate(struct file *file); +static inline struct fown_struct *file_f_owner(const struct file *file) +{ + return READ_ONCE(file->f_owner); +} + extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) @@ -1124,7 +1131,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); -extern int send_sigurg(struct fown_struct *fown); +extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where diff --git a/net/core/sock.c b/net/core/sock.c index 9abc4fe25953..bbe4c58470c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3429,7 +3429,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bfa61e005aac..577e1fcbf6df 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3950,7 +3950,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; fsec = selinux_file(file); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4164699cd4f6..cb33920ab67c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1950,7 +1950,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, /* * struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; /* we don't log here as rc can be overriden */ blob = smack_file(file); From a55d1cbd1720679cfe9837bce250e397ec513989 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 16:14:46 +0200 Subject: [PATCH 137/168] fs: switch f_iocb_flags and f_ra Now that we shrank struct file by 24 bytes we still have a 4 byte hole. If we move struct file_ra_state into the union and f_iocb_flags out of the union we close that whole and bring down struct file to 192 bytes. Which means struct file is 3 cachelines and we managed to shrink it by 40 bytes this cycle. I've tried to audit all codepaths that use f_ra and none of them seem to rely on it in file->f_op->release() and never have since commit 1da177e4c3f4 ("Linux-2.6.12-rc2"). Link: https://lore.kernel.org/r/20240823-luftdicht-berappen-d69a2166a0db@brauner Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7af239ca87e2..095a956aeb29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -999,9 +999,9 @@ struct file { struct callback_head f_task_work; /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - unsigned int f_iocb_flags; + /* Invalid after last fput(). */ + struct file_ra_state f_ra; }; - /* * Protects f_ep, f_flags. * Must not be taken from IRQ context. @@ -1012,9 +1012,9 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; + unsigned int f_iocb_flags; struct fown_struct *f_owner; const struct cred *f_cred; - struct file_ra_state f_ra; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; From a47454ef70c28706a83841f39f424730e57f2e4d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 21:06:58 +0200 Subject: [PATCH 138/168] fs: pack struct file Now that we shrunk struct file to 192 bytes aka 3 cachelines reorder struct file to not leave any holes or have members cross cachelines. Add a short comment to each of the fields and mark the cachelines. It's possible that we may have to tweak this based on profiling in the future. So far I had Jens test this comparing io_uring with non-fixed and fixed files and it improved performance. The layout is a combination of Jens' and my changes. Link: https: //lore.kernel.org/r/20240824-peinigen-hocken-7384b977c643@brauner Signed-off-by: Christian Brauner --- include/linux/fs.h | 91 ++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 095a956aeb29..6b9418f46621 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -987,52 +987,63 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } -/* - * f_{lock,count,pos_lock} members can be highly contended and share - * the same cacheline. f_{lock,mode} are very frequently used together - * and so share the same cacheline as well. The read-mostly - * f_{path,inode,op} are kept on a separate cacheline. +/** + * struct file - Represents a file + * @f_count: reference count + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. + * @f_mode: FMODE_* flags often used in hotpaths + * @f_op: file operations + * @f_mapping: Contents of a cacheable, mappable object. + * @private_data: filesystem or driver specific data + * @f_inode: cached inode + * @f_flags: file flags + * @f_iocb_flags: iocb flags + * @f_cred: stashed credentials of creator/opener + * @f_path: path of the file + * @f_pos_lock: lock protecting file position + * @f_pos: file position + * @f_version: file version + * @f_security: LSM security context of this file + * @f_owner: file owner + * @f_wb_err: writeback error + * @f_sb_err: per sb writeback errors + * @f_ep: link of all epoll hooks for this file + * @f_task_work: task work entry point + * @f_llist: work queue entrypoint + * @f_ra: file's readahead state */ struct file { + atomic_long_t f_count; + spinlock_t f_lock; + fmode_t f_mode; + const struct file_operations *f_op; + struct address_space *f_mapping; + void *private_data; + struct inode *f_inode; + unsigned int f_flags; + unsigned int f_iocb_flags; + const struct cred *f_cred; + /* --- cacheline 1 boundary (64 bytes) --- */ + struct path f_path; + struct mutex f_pos_lock; + loff_t f_pos; + u64 f_version; +#ifdef CONFIG_SECURITY + void *f_security; +#endif + /* --- cacheline 2 boundary (128 bytes) --- */ + struct fown_struct *f_owner; + errseq_t f_wb_err; + errseq_t f_sb_err; +#ifdef CONFIG_EPOLL + struct hlist_head *f_ep; +#endif union { - /* fput() uses task work when closing and freeing file (default). */ - struct callback_head f_task_work; - /* fput() must use workqueue (most kernel threads). */ + struct callback_head f_task_work; struct llist_node f_llist; - /* Invalid after last fput(). */ struct file_ra_state f_ra; }; - /* - * Protects f_ep, f_flags. - * Must not be taken from IRQ context. - */ - spinlock_t f_lock; - fmode_t f_mode; - atomic_long_t f_count; - struct mutex f_pos_lock; - loff_t f_pos; - unsigned int f_flags; - unsigned int f_iocb_flags; - struct fown_struct *f_owner; - const struct cred *f_cred; - struct path f_path; - struct inode *f_inode; /* cached value */ - const struct file_operations *f_op; - - u64 f_version; -#ifdef CONFIG_SECURITY - void *f_security; -#endif - /* needed for tty driver, and maybe others */ - void *private_data; - -#ifdef CONFIG_EPOLL - /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head *f_ep; -#endif /* #ifdef CONFIG_EPOLL */ - struct address_space *f_mapping; - errseq_t f_wb_err; - errseq_t f_sb_err; /* for syncfs */ + /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ From f0d2ed325b5d34d8fe0d618ae7e933a87f0fa40a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:23 +0200 Subject: [PATCH 139/168] mm: remove unused root_cache argument That argument is unused so remove it. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-1-5460bc1f09f6@kernel.org Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- mm/slab_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..c8dd7e08c5f6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -204,8 +204,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *), - struct kmem_cache *root_cache) + unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -334,7 +333,7 @@ kmem_cache_create_usercopy(const char *name, s = create_cache(cache_name, size, calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor, NULL); + flags, useroffset, usersize, ctor); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); From 18dc9b3951080f2aefb63112b99cd6c95357a8d8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:24 +0200 Subject: [PATCH 140/168] mm: add kmem_cache_create_rcu() When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer must be located outside of the object because we don't know what part of the memory can safely be overwritten as it may be needed to prevent object recycling. That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a new cacheline. This is the case for e.g., struct file. After having it shrunk down by 40 bytes and having it fit in three cachelines we still have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to accommodate the free pointer. Add a new kmem_cache_create_rcu() function that allows the caller to specify an offset where the free pointer is supposed to be placed. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-2-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/slab.h | 9 +++ mm/slab.h | 2 + mm/slab_common.c | 136 ++++++++++++++++++++++++++++++------------- mm/slub.c | 20 ++++--- 4 files changed, 121 insertions(+), 46 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..5b2da2cf31a8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -212,6 +212,12 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags); void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); diff --git a/mm/slab.h b/mm/slab.h index dcdb56b8e7f5..a6051385186e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,8 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ + /* Specific free pointer requested (if not UINT_MAX) */ + unsigned int rcu_freeptr_offset; #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; diff --git a/mm/slab_common.c b/mm/slab_common.c index c8dd7e08c5f6..887f6b9855dd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,9 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *)) + unsigned int object_size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -212,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name, if (WARN_ON(useroffset + usersize > object_size)) useroffset = usersize = 0; + /* If a custom freelist pointer is requested make sure it's sane. */ + err = -EINVAL; + if (freeptr_offset != UINT_MAX && + (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) + goto out; + err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) @@ -219,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name, s->name = name; s->size = s->object_size = object_size; + s->rcu_freeptr_offset = freeptr_offset; s->align = align; s->ctor = ctor; #ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); if (err) goto out_free_cache; @@ -240,38 +248,10 @@ out: return ERR_PTR(err); } -/** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, +static struct kmem_cache * +do_kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { @@ -331,7 +311,7 @@ kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, + s = create_cache(cache_name, size, freeptr_offset, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor); if (IS_ERR(s)) { @@ -355,6 +335,45 @@ out_unlock: } return s; } + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + useroffset, usersize, ctor); +} EXPORT_SYMBOL(kmem_cache_create_usercopy); /** @@ -386,11 +405,50 @@ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, - ctor); + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + 0, 0, ctor); } EXPORT_SYMBOL(kmem_cache_create); +/** + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: The offset into the memory to the free pointer + * @flags: SLAB flags + * + * Cannot be called within an interrupt, but can be interrupted. + * + * See kmem_cache_create() for an explanation of possible @flags. + * + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside + * of the object. This might cause the object to grow in size. Callers + * that have a reason to avoid this can specify a custom free pointer + * offset in their struct where the free pointer will be placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for + * details.). + * + * Using zero as a value for @freeptr_offset is valid. To request no + * offset UINT_MAX must be specified. + * + * Note that @ctor isn't supported with custom free pointers as a @ctor + * requires an external free pointer. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags) +{ + return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, + flags | SLAB_TYPESAFE_BY_RCU, 0, 0, + NULL); +} +EXPORT_SYMBOL(kmem_cache_create_rcu); + static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..9aa5da1e8e27 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq; * Core slab cache functions *******************************************************************/ -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache @@ -3921,6 +3915,9 @@ static void *__slab_alloc_node(struct kmem_cache *s, /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers specified via + * s->rcu_freeptr_offset. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -5144,6 +5141,12 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } +/* Was a valid freeptr offset requested? */ +static inline bool has_freeptr_offset(const struct kmem_cache *s) +{ + return s->rcu_freeptr_offset != UINT_MAX; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -5189,7 +5192,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -5210,6 +5214,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { + s->offset = s->rcu_freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep From 779d4d7141bbe1ec8eb270dd22f5a484a60d5554 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:25 +0200 Subject: [PATCH 141/168] fs: use kmem_cache_create_rcu() Switch to the new kmem_cache_create_rcu() helper which allows us to use a custom free pointer offset avoiding the need to have an external free pointer which would grow struct file behind our backs. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-3-5460bc1f09f6@kernel.org Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- fs/file_table.c | 6 +++--- include/linux/fs.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index fdf98709dde2..3ef558f27a1c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -511,9 +511,9 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | - SLAB_PANIC | SLAB_ACCOUNT, NULL); + filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), + offsetof(struct file, f_freeptr), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 6b9418f46621..3d538c38256e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1011,6 +1011,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { atomic_long_t f_count; @@ -1042,6 +1043,7 @@ struct file { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; + freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout From 6a223e846e6222bec9d57c90d7e0d781ae498aac Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 14 Aug 2024 17:02:31 +0800 Subject: [PATCH 142/168] autofs: add per dentry expire timeout Add ability to set per-dentry mount expire timeout to autofs. There are two fairly well known automounter map formats, the autofs format and the amd format (more or less System V and Berkley). Some time ago Linux autofs added an amd map format parser that implemented a fair amount of the amd functionality. This was done within the autofs infrastructure and some functionality wasn't implemented because it either didn't make sense or required extra kernel changes. The idea was to restrict changes to be within the existing autofs functionality as much as possible and leave changes with a wider scope to be considered later. One of these changes is implementing the amd options: 1) "unmount", expire this mount according to a timeout (same as the current autofs default). 2) "nounmount", don't expire this mount (same as setting the autofs timeout to 0 except only for this specific mount) . 3) "utimeout=", expire this mount using the specified timeout (again same as setting the autofs timeout but only for this mount). To implement these options per-dentry expire timeouts need to be implemented for autofs indirect mounts. This is because all map keys (mounts) for autofs indirect mounts use an expire timeout stored in the autofs mount super block info. structure and all indirect mounts use the same expire timeout. Now I have a request to add the "nounmount" option so I need to add the per-dentry expire handling to the kernel implementation to do this. The implementation uses the trailing path component to identify the mount (and is also used as the autofs map key) which is passed in the autofs_dev_ioctl structure path field. The expire timeout is passed in autofs_dev_ioctl timeout field (well, of the timeout union). If the passed in timeout is equal to -1 the per-dentry timeout and flag are cleared providing for the "unmount" option. If the timeout is greater than or equal to 0 the timeout is set to the value and the flag is also set. If the dentry timeout is 0 the dentry will not expire by timeout which enables the implementation of the "nounmount" option for the specific mount. When the dentry timeout is greater than zero it allows for the implementation of the "utimeout=" option. Signed-off-by: Ian Kent Link: https://lore.kernel.org/r/20240814090231.963520-1-raven@themaw.net Signed-off-by: Christian Brauner --- fs/autofs/autofs_i.h | 4 ++ fs/autofs/dev-ioctl.c | 97 ++++++++++++++++++++++++++++++++++-- fs/autofs/expire.c | 7 ++- fs/autofs/inode.c | 2 + include/uapi/linux/auto_fs.h | 2 +- 5 files changed, 104 insertions(+), 8 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8c1d587b3eef..77c7991d89aa 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -62,6 +62,7 @@ struct autofs_info { struct list_head expiring; struct autofs_sb_info *sbi; + unsigned long exp_timeout; unsigned long last_used; int count; @@ -81,6 +82,9 @@ struct autofs_info { */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ +#define AUTOFS_INF_EXPIRE_SET (1<<3) /* per-dentry expire timeout set for + this mount point. + */ struct autofs_wait_queue { wait_queue_head_t queue; struct autofs_wait_queue *next; diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 5bf781ea6d67..f011e026358e 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -128,7 +128,13 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } + /* Setting the per-dentry expire timeout requires a trailing + * path component, ie. no '/', so invert the logic of the + * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. + */ err = check_name(param->path); + if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) + err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); @@ -396,16 +402,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, return 0; } -/* Set the autofs mount timeout */ +/* + * Set the autofs mount expire timeout. + * + * There are two places an expire timeout can be set, in the autofs + * super block info. (this is all that's needed for direct and offset + * mounts because there's a distinct mount corresponding to each of + * these) and per-dentry within within the dentry info. If a per-dentry + * timeout is set it will override the expire timeout set in the parent + * autofs super block info. + * + * If setting the autofs super block expire timeout the autofs_dev_ioctl + * size field will be equal to the autofs_dev_ioctl structure size. If + * setting the per-dentry expire timeout the mount point name is passed + * in the autofs_dev_ioctl path field and the size field updated to + * reflect this. + * + * Setting the autofs mount expire timeout sets the timeout in the super + * block info. struct. Setting the per-dentry timeout does a little more. + * If the timeout is equal to -1 the per-dentry timeout (and flag) is + * cleared which reverts to using the super block timeout, otherwise if + * timeout is 0 the timeout is set to this value and the flag is left + * set which disables expiration for the mount point, lastly the flag + * and the timeout are set enabling the dentry to use this timeout. + */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - unsigned long timeout; + unsigned long timeout = param->timeout.timeout; + + /* If setting the expire timeout for an individual indirect + * mount point dentry the mount trailing component path is + * placed in param->path and param->size adjusted to account + * for it otherwise param->size it is set to the structure + * size. + */ + if (param->size == AUTOFS_DEV_IOCTL_SIZE) { + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + } else { + struct dentry *base = fp->f_path.dentry; + struct inode *inode = base->d_inode; + int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; + struct dentry *dentry; + struct autofs_info *ino; + + if (!autofs_type_indirect(sbi->type)) + return -EINVAL; + + /* An expire timeout greater than the superblock timeout + * could be a problem at shutdown but the super block + * timeout itself can change so all we can really do is + * warn the user. + */ + if (timeout >= sbi->exp_timeout) + pr_warn("per-mount expire timeout is greater than " + "the parent autofs mount timeout which could " + "prevent shutdown\n"); + + inode_lock_shared(inode); + dentry = try_lookup_one_len(param->path, base, path_len); + inode_unlock_shared(inode); + if (IS_ERR_OR_NULL(dentry)) + return dentry ? PTR_ERR(dentry) : -ENOENT; + ino = autofs_dentry_ino(dentry); + if (!ino) { + dput(dentry); + return -ENOENT; + } + + if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) + param->timeout.timeout = ino->exp_timeout / HZ; + else + param->timeout.timeout = sbi->exp_timeout / HZ; + + if (timeout == -1) { + /* Revert to using the super block timeout */ + ino->flags &= ~AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = 0; + } else { + /* Set the dentry expire flag and timeout. + * + * If timeout is 0 it will prevent the expire + * of this particular automount. + */ + ino->flags |= AUTOFS_INF_EXPIRE_SET; + ino->exp_timeout = timeout * HZ; + } + dput(dentry); + } - timeout = param->timeout.timeout; - param->timeout.timeout = sbi->exp_timeout / HZ; - sbi->exp_timeout = timeout * HZ; return 0; } diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 39d8c84c16f4..5c2d459e1e48 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, if (!root) return NULL; - timeout = sbi->exp_timeout; - dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); @@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, } spin_unlock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_EXPIRE_SET) + timeout = ino->exp_timeout; + else + timeout = sbi->exp_timeout; + expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 64faa6c51f60..ee2edccaef70 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) INIT_LIST_HEAD(&ino->expiring); ino->last_used = jiffies; ino->sbi = sbi; + ino->exp_timeout = -1; ino->count = 1; } return ino; @@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; + ino->exp_timeout = -1; ino->last_used = jiffies; } diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 1f7925afad2d..8081df849743 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 5 +#define AUTOFS_PROTO_SUBVERSION 6 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed From 8fe15dd70207ae341bcb89f12175492064f34fce Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 15 Aug 2024 10:33:10 +0200 Subject: [PATCH 143/168] vfs: elide smp_mb in iversion handling in the common case According to bpftrace on these routines most calls result in cmpxchg, which already provides the same guarantee. In inode_maybe_inc_iversion elision is possible because even if the wrong value was read due to now missing smp_mb fence, the issue is going to correct itself after cmpxchg. If it appears cmpxchg wont be issued, the fence + reload are there bringing back previous behavior. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240815083310.3865-1-mjguzik@gmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/libfs.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 02602d00939e..7874b23364e1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -2003,13 +2003,19 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force) * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. + * We add a full memory barrier to ensure that any de facto ordering + * with other state is preserved (either implicitly coming from cmpxchg + * or explicitly from smp_mb if we don't know upfront if we will execute + * the former). * - * This barrier pairs with the barrier in inode_query_iversion() + * These barriers pair with inode_query_iversion(). */ - smp_mb(); cur = inode_peek_iversion_raw(inode); + if (!force && !(cur & I_VERSION_QUERIED)) { + smp_mb(); + cur = inode_peek_iversion_raw(inode); + } + do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) @@ -2038,20 +2044,22 @@ EXPORT_SYMBOL(inode_maybe_inc_iversion); u64 inode_query_iversion(struct inode *inode) { u64 cur, new; + bool fenced = false; + /* + * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with + * inode_maybe_inc_iversion(), see that routine for more details. + */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { - /* - * This barrier (and the implicit barrier in the - * cmpxchg below) pairs with the barrier in - * inode_maybe_inc_iversion(). - */ - smp_mb(); + if (!fenced) + smp_mb(); break; } + fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; From 489e17fed2210508db462acef249b89d3eec6c5d Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 16 Aug 2024 14:38:49 +0800 Subject: [PATCH 144/168] fs: Use in_group_or_capable() helper to simplify the code Since in_group_or_capable has been exported, we can use it to simplify the code when check group and capable. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240816063849.1989856-1-lihongbo22@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/posix_acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3f87297dbfdb..6c66a37522d0 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -715,8 +715,8 @@ int posix_acl_update_mode(struct mnt_idmap *idmap, return error; if (error == 0) *acl = NULL; - if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) && - !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID)) + if (!in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; From afe9bc60b7ec9433ecf2c32b550eeb9fc1d6b647 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 16 Aug 2024 14:36:11 +0800 Subject: [PATCH 145/168] doc: correcting the idmapping mount example In step 2, we obtain the kernel id `k1000`. So in next step (step 3), we should translate the `k1000` not `k21000`. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240816063611.1961910-1-lihongbo22@huawei.com Signed-off-by: Christian Brauner --- Documentation/filesystems/idmappings.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst index ac0af679e61e..77930c77fcfe 100644 --- a/Documentation/filesystems/idmappings.rst +++ b/Documentation/filesystems/idmappings.rst @@ -821,7 +821,7 @@ the same idmapping to the mount. We now perform three steps: /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k20000:r10000, u1000) = k21000 -2. Verify that the caller's kernel ids can be mapped to userspace ids in the +3. Verify that the caller's kernel ids can be mapped to userspace ids in the filesystem's idmapping:: from_kuid(u0:k20000:r10000, k21000) = u1000 @@ -854,10 +854,10 @@ The same translation algorithm works with the third example. /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k0:r4294967295, u1000) = k1000 -2. Verify that the caller's kernel ids can be mapped to userspace ids in the +3. Verify that the caller's kernel ids can be mapped to userspace ids in the filesystem's idmapping:: - from_kuid(u0:k0:r4294967295, k21000) = u1000 + from_kuid(u0:k0:r4294967295, k1000) = u1000 So the ownership that lands on disk will be ``u1000``. @@ -994,7 +994,7 @@ from above::: /* Map the userspace id down into a kernel id in the filesystem's idmapping. */ make_kuid(u0:k0:r4294967295, u1000) = k1000 -2. Verify that the caller's filesystem ids can be mapped to userspace ids in the +3. Verify that the caller's filesystem ids can be mapped to userspace ids in the filesystem's idmapping:: from_kuid(u0:k0:r4294967295, k1000) = u1000 From c943c82f614f7ac8af69fb2cff0226c0ce5619f2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Aug 2024 16:35:52 +0200 Subject: [PATCH 146/168] inode: remove __I_DIO_WAKEUP Afaict, we can just rely on inode->i_dio_count for waiting instead of this awkward indirection through __I_DIO_WAKEUP. This survives LTP dio and xfstests dio tests. Link: https://lore.kernel.org/r/20240816-vfs-misc-dio-v1-1-80fe21a2c710@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/inode.c | 23 +++++++++++------------ fs/netfs/locking.c | 22 +++++----------------- include/linux/fs.h | 8 +++----- 3 files changed, 19 insertions(+), 34 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index de0969d6009c..248a131a02c3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2500,18 +2500,11 @@ EXPORT_SYMBOL(inode_owner_or_capable); /* * Direct i/o helper functions */ -static void __inode_dio_wait(struct inode *inode) +bool inode_dio_finished(const struct inode *inode) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - do { - prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); - if (atomic_read(&inode->i_dio_count)) - schedule(); - } while (atomic_read(&inode->i_dio_count)); - finish_wait(wq, &q.wq_entry); + return atomic_read(&inode->i_dio_count) == 0; } +EXPORT_SYMBOL(inode_dio_finished); /** * inode_dio_wait - wait for outstanding DIO requests to finish @@ -2525,11 +2518,17 @@ static void __inode_dio_wait(struct inode *inode) */ void inode_dio_wait(struct inode *inode) { - if (atomic_read(&inode->i_dio_count)) - __inode_dio_wait(inode); + wait_var_event(&inode->i_dio_count, inode_dio_finished(inode)); } EXPORT_SYMBOL(inode_dio_wait); +void inode_dio_wait_interruptible(struct inode *inode) +{ + wait_var_event_interruptible(&inode->i_dio_count, + inode_dio_finished(inode)); +} +EXPORT_SYMBOL(inode_dio_wait_interruptible); + /* * inode_set_flags - atomically set some inode flags * diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c index 75dc52a49b3a..21eab56ee2f9 100644 --- a/fs/netfs/locking.c +++ b/fs/netfs/locking.c @@ -19,25 +19,13 @@ * Must be called under a lock that serializes taking new references * to i_dio_count, usually by inode->i_mutex. */ -static int inode_dio_wait_interruptible(struct inode *inode) +static int netfs_inode_dio_wait_interruptible(struct inode *inode) { - if (!atomic_read(&inode->i_dio_count)) + if (inode_dio_finished(inode)) return 0; - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); - DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); - - for (;;) { - prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE); - if (!atomic_read(&inode->i_dio_count)) - break; - if (signal_pending(current)) - break; - schedule(); - } - finish_wait(wq, &q.wq_entry); - - return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0; + inode_dio_wait_interruptible(inode); + return !inode_dio_finished(inode) ? -ERESTARTSYS : 0; } /* Call with exclusively locked inode->i_rwsem */ @@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx) if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) return 0; clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags); - return inode_dio_wait_interruptible(&ictx->inode); + return netfs_inode_dio_wait_interruptible(&ictx->inode); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 0145bda465ff..8f8d274929d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2373,8 +2373,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_REFERENCED Marks the inode as recently references on the LRU list. * - * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). - * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See @@ -2409,8 +2407,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) -#define __I_DIO_WAKEUP 9 -#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 13) @@ -3227,7 +3223,9 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, } #endif +bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); +void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests @@ -3251,7 +3249,7 @@ static inline void inode_dio_begin(struct inode *inode) static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); + wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, From a2e2178f3209de4b565fa25328a3906b8162c18b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Aur=C3=A8le=20La=20France?= Date: Sat, 10 Aug 2024 13:25:27 -0600 Subject: [PATCH 147/168] debugfs show actual source in /proc/mounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After its conversion to the new mount API, debugfs displays "none" in /proc/mounts instead of the actual source. Fix this by recognising its "source" mount option. Signed-off-by: Marc Aurèle La France Link: https://lore.kernel.org/r/e439fae2-01da-234b-75b9-2a7951671e27@tuyoix.net Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API") Cc: stable@vger.kernel.org # 6.10.x: 49abee5991e1: debugfs: Convert to new uid/gid option parsing helpers Signed-off-by: Christian Brauner --- fs/debugfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 91521576f500..66d9b3b4c588 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -89,12 +89,14 @@ enum { Opt_uid, Opt_gid, Opt_mode, + Opt_source, }; static const struct fs_parameter_spec debugfs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), + fsparam_string ("source", Opt_source), {} }; @@ -126,6 +128,12 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; + case Opt_source: + if (fc->source) + return invalfc(fc, "Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; /* * We might like to report bad mount options here; * but traditionally debugfs has ignored all mount options From df37e1e53c7c53577d09a4a905f94e3c79a49fd3 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 13 Aug 2024 16:36:26 +0200 Subject: [PATCH 148/168] vfs: drop one lock trip in evict() Most commonly neither I_LRU_ISOLATING nor I_SYNC are set, but the stock kernel takes a back-to-back relock trip to check for them. It probably can be avoided altogether, but for now massage things back to just one lock acquire. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240813143626.1573445-1-mjguzik@gmail.com Reviewed-by: Zhihao Cheng Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 17 +++-------------- fs/inode.c | 5 +++-- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4451ecff37c4..1a5006329f6f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1510,13 +1510,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. */ -static void __inode_wait_for_writeback(struct inode *inode) - __releases(inode->i_lock) - __acquires(inode->i_lock) +void inode_wait_for_writeback(struct inode *inode) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; + lockdep_assert_held(&inode->i_lock); wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); @@ -1526,16 +1525,6 @@ static void __inode_wait_for_writeback(struct inode *inode) } } -/* - * Wait for writeback on an inode to complete. Caller must have inode pinned. - */ -void inode_wait_for_writeback(struct inode *inode) -{ - spin_lock(&inode->i_lock); - __inode_wait_for_writeback(inode); - spin_unlock(&inode->i_lock); -} - /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference @@ -1757,7 +1746,7 @@ static int writeback_single_inode(struct inode *inode, */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; - __inode_wait_for_writeback(inode); + inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* diff --git a/fs/inode.c b/fs/inode.c index 248a131a02c3..154f8689457f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -507,7 +507,7 @@ static void inode_unpin_lru_isolating(struct inode *inode) static void inode_wait_for_lru_isolating(struct inode *inode) { - spin_lock(&inode->i_lock); + lockdep_assert_held(&inode->i_lock); if (inode->i_state & I_LRU_ISOLATING) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); wait_queue_head_t *wqh; @@ -518,7 +518,6 @@ static void inode_wait_for_lru_isolating(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_LRU_ISOLATING); } - spin_unlock(&inode->i_lock); } /** @@ -690,6 +689,7 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + spin_lock(&inode->i_lock); inode_wait_for_lru_isolating(inode); /* @@ -699,6 +699,7 @@ static void evict(struct inode *inode) * the inode. We just have to wait for running writeback to finish. */ inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); if (op->evict_inode) { op->evict_inode(inode); From 6af0ec2a0ae2283117a333da4ef319d3df9c00ce Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 11:28:38 +0200 Subject: [PATCH 149/168] fs: remove unused path_put_init() This helper has been unused for a while now. Link: https://lore.kernel.org/r/20240822-bewuchs-werktag-46672b3c0606@brauner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/path.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/path.h b/include/linux/path.h index ca073e70decd..7ea389dc764b 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -18,12 +18,6 @@ static inline int path_equal(const struct path *path1, const struct path *path2) return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } -static inline void path_put_init(struct path *path) -{ - path_put(path); - *path = (struct path) { }; -} - /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty From 3125625795796e591bf151135f1b908f7dbd75cd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 11:30:58 +0200 Subject: [PATCH 150/168] fs: s/__u32/u32/ for s_fsnotify_mask The underscore variants are for uapi whereas the non-underscore variants are for in-kernel consumers. Link: https://lore.kernel.org/r/20240822-anwerben-nutzung-1cd6c82a565f@brauner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f8d274929d7..cda1f2545ea0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1266,7 +1266,7 @@ struct super_block { time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY - __u32 s_fsnotify_mask; + u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif From 6a4d23515f3b8c0a4da0c37d19effbcaa73a6f40 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 20 Aug 2024 12:51:09 -0700 Subject: [PATCH 151/168] MAINTAINERS: add the VFS git tree The VFS git tree is missing from MAINTAINERS. Add it. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240820195109.38906-1-ebiggers@kernel.org Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index f328373463b0..c63e588e87dd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8597,6 +8597,7 @@ M: Christian Brauner R: Jan Kara L: linux-fsdevel@vger.kernel.org S: Maintained +T: git https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git F: fs/* F: include/linux/fs.h F: include/linux/fs_types.h From d126e3782fd87c0eea6b7aebe78e0968fb9fbe02 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Fri, 23 Aug 2024 21:07:30 +0800 Subject: [PATCH 152/168] vfs: fix race between evice_inodes() and find_inode()&iput() Hi, all Recently I noticed a bug[1] in btrfs, after digged it into and I believe it'a race in vfs. Let's assume there's a inode (ie ino 261) with i_count 1 is called by iput(), and there's a concurrent thread calling generic_shutdown_super(). cpu0: cpu1: iput() // i_count is 1 ->spin_lock(inode) ->dec i_count to 0 ->iput_final() generic_shutdown_super() ->__inode_add_lru() ->evict_inodes() // cause some reason[2] ->if (atomic_read(inode->i_count)) continue; // return before // inode 261 passed the above check // list_lru_add_obj() // and then schedule out ->spin_unlock() // note here: the inode 261 // was still at sb list and hash list, // and I_FREEING|I_WILL_FREE was not been set btrfs_iget() // after some function calls ->find_inode() // found the above inode 261 ->spin_lock(inode) // check I_FREEING|I_WILL_FREE // and passed ->__iget() ->spin_unlock(inode) // schedule back ->spin_lock(inode) // check (I_NEW|I_FREEING|I_WILL_FREE) flags, // passed and set I_FREEING iput() ->spin_unlock(inode) ->spin_lock(inode) ->evict() // dec i_count to 0 ->iput_final() ->spin_unlock() ->evict() Now, we have two threads simultaneously evicting the same inode, which may trigger the BUG(inode->i_state & I_CLEAR) statement both within clear_inode() and iput(). To fix the bug, recheck the inode->i_count after holding i_lock. Because in the most scenarios, the first check is valid, and the overhead of spin_lock() can be reduced. If there is any misunderstanding, please let me know, thanks. [1]: https://lore.kernel.org/linux-btrfs/000000000000eabe1d0619c48986@google.com/ [2]: The reason might be 1. SB_ACTIVE was removed or 2. mapping_shrinkable() return false when I reproduced the bug. Reported-by: syzbot+67ba3c42bcbb4665d3ad@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=67ba3c42bcbb4665d3ad CC: stable@vger.kernel.org Fixes: 63997e98a3be ("split invalidate_inodes()") Signed-off-by: Julian Sun Link: https://lore.kernel.org/r/20240823130730.658881-1-sunjunchao2870@gmail.com Signed-off-by: Christian Brauner --- fs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 154f8689457f..ba1645a09603 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -771,6 +771,10 @@ again: continue; spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); continue; From 7864f8a6cc92856b4bd856a76f2947942207a434 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:35 +0200 Subject: [PATCH 153/168] fs: add i_state helpers The i_state member is an unsigned long so that it can be used with the wait bit infrastructure which expects unsigned long. This wastes 4 bytes which we're unlikely to ever use. Switch to using the var event wait mechanism using the address of the bit. Thanks to Linus for the address idea. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-1-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/inode.c | 11 +++++++++++ include/linux/fs.h | 15 +++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index ba1645a09603..1b0f52ebae27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -472,6 +472,17 @@ static void __inode_add_lru(struct inode *inode, bool rotate) inode->i_state |= I_REFERENCED; } +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit) +{ + void *bit_address; + + bit_address = inode_state_wait_address(inode, bit); + init_wait_var_entry(wqe, bit_address, 0); + return __var_waitqueue(bit_address); +} +EXPORT_SYMBOL(inode_bit_waitqueue); + /* * Add inode to LRU if needed (inode is unused and clean). * diff --git a/include/linux/fs.h b/include/linux/fs.h index cda1f2545ea0..d35de348d2ec 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -744,6 +744,21 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * Get bit address from inode->i_state to use with wait_var_event() + * infrastructre. + */ +#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) + +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit); + +static inline void inode_wake_up_bit(struct inode *inode, u32 bit) +{ + /* Caller is responsible for correct memory barriers. */ + wake_up_var(inode_state_wait_address(inode, bit)); +} + struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) From 86aa46a86f9c01f1ab91acd243a9e5ba460b9dcd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:36 +0200 Subject: [PATCH 154/168] fs: reorder i_state bits so that we can use the first bits to derive unique addresses from i_state. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-2-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d35de348d2ec..a868c9823c10 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2410,28 +2410,32 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. */ -#define I_DIRTY_SYNC (1 << 0) -#define I_DIRTY_DATASYNC (1 << 1) -#define I_DIRTY_PAGES (1 << 2) -#define __I_NEW 3 +#define __I_NEW 0 #define I_NEW (1 << __I_NEW) -#define I_WILL_FREE (1 << 4) -#define I_FREEING (1 << 5) -#define I_CLEAR (1 << 6) -#define __I_SYNC 7 +#define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) -#define I_REFERENCED (1 << 8) +#define __I_LRU_ISOLATING 2 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) + +#define I_DIRTY_SYNC (1 << 3) +#define I_DIRTY_DATASYNC (1 << 4) +#define I_DIRTY_PAGES (1 << 5) +#define I_WILL_FREE (1 << 6) +#define I_FREEING (1 << 7) +#define I_CLEAR (1 << 8) +#define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 13) -#define I_OVL_INUSE (1 << 14) -#define I_CREATING (1 << 15) -#define I_DONTCACHE (1 << 16) -#define I_SYNC_QUEUED (1 << 17) -#define I_PINNING_NETFS_WB (1 << 18) -#define __I_LRU_ISOLATING 19 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) +#define I_WB_SWITCH (1 << 12) +#define I_OVL_INUSE (1 << 13) +#define I_CREATING (1 << 14) +#define I_DONTCACHE (1 << 15) +#define I_SYNC_QUEUED (1 << 16) +#define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) From f9fc3f2b5e3b8d0093dd435bfc63f998400776be Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:37 +0200 Subject: [PATCH 155/168] inode: port __I_SYNC to var event Port the __I_SYNC mechanism to use the new var event mechanism. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-3-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a5006329f6f..d8bec3c1bb1f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1386,12 +1386,13 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) static void inode_sync_complete(struct inode *inode) { + assert_spin_locked(&inode->i_lock); + inode->i_state &= ~I_SYNC; /* If inode is clean an unused, put it into LRU now... */ inode_add_lru(inode); - /* Waiters must see I_SYNC cleared before being woken up */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_SYNC); + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) @@ -1512,17 +1513,25 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc) */ void inode_wait_for_writeback(struct inode *inode) { - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); - wait_queue_head_t *wqh; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - lockdep_assert_held(&inode->i_lock); - wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { + assert_spin_locked(&inode->i_lock); + + if (!(inode->i_state & I_SYNC)) + return; + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ + if (!(inode->i_state & I_SYNC)) + break; spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, bit_wait, - TASK_UNINTERRUPTIBLE); + schedule(); spin_lock(&inode->i_lock); } + finish_wait(wq_head, &wqe.wq_entry); } /* @@ -1533,16 +1542,20 @@ void inode_wait_for_writeback(struct inode *inode) static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { - DEFINE_WAIT(wait); - wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - int sleep; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; + bool sleep; - prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); - sleep = inode->i_state & I_SYNC; + assert_spin_locked(&inode->i_lock); + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */ + sleep = !!(inode->i_state & I_SYNC); spin_unlock(&inode->i_lock); if (sleep) schedule(); - finish_wait(wqh, &wait); + finish_wait(wq_head, &wqe.wq_entry); } /* From 2cef0e1dc8e62bd656dddd7e2f125ce1a2218dab Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:38 +0200 Subject: [PATCH 156/168] inode: port __I_NEW to var event Port the __I_NEW mechanism to use the new var event mechanism. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/bcachefs/fs.c | 10 ++++++---- fs/dcache.c | 7 ++++++- fs/inode.c | 32 ++++++++++++++++++++++++-------- include/linux/writeback.h | 3 ++- 4 files changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 94c392abef65..c0900c0c0f8a 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1644,14 +1644,16 @@ again: break; } } else if (clean_pass && this_pass_clean) { - wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, + TASK_UNINTERRUPTIBLE); mutex_unlock(&c->vfs_inodes_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); goto again; } } diff --git a/fs/dcache.c b/fs/dcache.c index 1af75fa68638..894e38cdf4d0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1908,8 +1908,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); diff --git a/fs/inode.c b/fs/inode.c index 1b0f52ebae27..1aa785421f13 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -734,7 +734,13 @@ static void evict(struct inode *inode) * used as an indicator whether blocking on it is safe. */ spin_lock(&inode->i_lock); - wake_up_bit(&inode->i_state, __I_NEW); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); @@ -1146,8 +1152,13 @@ void unlock_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); @@ -1158,8 +1169,13 @@ void discard_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees the bit cleared or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); + inode_wake_up_bit(inode, __I_NEW); spin_unlock(&inode->i_lock); iput(inode); } @@ -2348,8 +2364,8 @@ EXPORT_SYMBOL(inode_needs_sync); */ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) { - wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; /* * Handle racing against evict(), see that routine for more details. @@ -2360,14 +2376,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock return; } - wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); rcu_read_unlock(); if (is_inode_hash_locked) spin_unlock(&inode_hash_lock); schedule(); - finish_wait(wq, &wait.wq_entry); + finish_wait(wq_head, &wqe.wq_entry); if (is_inode_hash_locked) spin_lock(&inode_hash_lock); rcu_read_lock(); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 56b85841ae4c..8f651bb0a1a5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { - wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); + wait_var_event(inode_state_wait_address(inode, __I_NEW), + !(READ_ONCE(inode->i_state) & I_NEW)); } #ifdef CONFIG_CGROUP_WRITEBACK From 34b10fce2bb969ee90d786e024f074e9cad456c9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:39 +0200 Subject: [PATCH 157/168] inode: port __I_LRU_ISOLATING to var event Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-5-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/inode.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 1aa785421f13..aacd05749c1f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -511,24 +511,35 @@ static void inode_unpin_lru_isolating(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); inode->i_state &= ~I_LRU_ISOLATING; - smp_mb(); - wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + /* Called with inode->i_lock which ensures memory ordering. */ + inode_wake_up_bit(inode, __I_LRU_ISOLATING); spin_unlock(&inode->i_lock); } static void inode_wait_for_lru_isolating(struct inode *inode) { - lockdep_assert_held(&inode->i_lock); - if (inode->i_state & I_LRU_ISOLATING) { - DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); - wait_queue_head_t *wqh; + struct wait_bit_queue_entry wqe; + struct wait_queue_head *wq_head; - wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + lockdep_assert_held(&inode->i_lock); + if (!(inode->i_state & I_LRU_ISOLATING)) + return; + + wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING); + for (;;) { + prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); + /* + * Checking I_LRU_ISOLATING with inode->i_lock guarantees + * memory ordering. + */ + if (!(inode->i_state & I_LRU_ISOLATING)) + break; spin_unlock(&inode->i_lock); - __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + schedule(); spin_lock(&inode->i_lock); - WARN_ON(inode->i_state & I_LRU_ISOLATING); } + finish_wait(wq_head, &wqe.wq_entry); + WARN_ON(inode->i_state & I_LRU_ISOLATING); } /** From dbd5479b432492b47490021c843e8c1f36d0ec7e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:40 +0200 Subject: [PATCH 158/168] inode: make i_state a u32 Now that we use the wait var event mechanism make i_state a u32 and free up 4 bytes. This means we currently have two 4 byte holes in struct inode which we can pack. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-6-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index a868c9823c10..a1ef8c65d828 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -681,7 +681,8 @@ struct inode { #endif /* Misc */ - unsigned long i_state; + u32 i_state; + /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ From c594d308f43f053f71e3c4072deda215c84477cc Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 21 Aug 2024 14:54:56 +0800 Subject: [PATCH 159/168] fs: use LIST_HEAD() to simplify code list_head can be initialized automatically with LIST_HEAD() instead of calling INIT_LIST_HEAD(). Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240821065456.2294216-1-lihongbo22@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index e55ad471c530..31a9062cad7e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -774,12 +774,11 @@ EXPORT_SYMBOL(block_dirty_folio); static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; - struct list_head tmp; struct address_space *mapping; int err = 0, err2; struct blk_plug plug; + LIST_HEAD(tmp); - INIT_LIST_HEAD(&tmp); blk_start_plug(&plug); spin_lock(lock); From 1e1613a3a00f21f2e5996067a54de837cde7f03a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 26 Aug 2024 19:34:04 +0800 Subject: [PATCH 160/168] netfs: Delete subtree of 'fs/netfs' when netfs module exits In netfs_init() or fscache_proc_init(), we create dentry under 'fs/netfs', but in netfs_exit(), we only delete the proc entry of 'fs/netfs' without deleting its subtree. This triggers the following WARNING: ================================================================== remove_proc_entry: removing non-empty directory 'fs/netfs', leaking at least 'requests' WARNING: CPU: 4 PID: 566 at fs/proc/generic.c:717 remove_proc_entry+0x160/0x1c0 Modules linked in: netfs(-) CPU: 4 UID: 0 PID: 566 Comm: rmmod Not tainted 6.11.0-rc3 #860 RIP: 0010:remove_proc_entry+0x160/0x1c0 Call Trace: netfs_exit+0x12/0x620 [netfs] __do_sys_delete_module.isra.0+0x14c/0x2e0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e ================================================================== Therefore use remove_proc_subtree() instead of remove_proc_entry() to fix the above problem. Fixes: 7eb5b3e3a0a5 ("netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a symlink") Cc: stable@kernel.org Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240826113404.3214786-1-libaokun@huaweicloud.com Acked-by: David Howells Signed-off-by: Christian Brauner --- fs/netfs/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5f0f438e5d21..9d6b49dc6694 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -142,7 +142,7 @@ static int __init netfs_init(void) error_fscache: error_procfile: - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); error_proc: mempool_exit(&netfs_subrequest_pool); error_subreqpool: @@ -159,7 +159,7 @@ fs_initcall(netfs_init); static void __exit netfs_exit(void) { fscache_exit(); - remove_proc_entry("fs/netfs", NULL); + remove_proc_subtree("fs/netfs", NULL); mempool_exit(&netfs_subrequest_pool); kmem_cache_destroy(netfs_subrequest_slab); mempool_exit(&netfs_request_pool); From bc1b083b55ba8c134baffc766eee695ee4600694 Mon Sep 17 00:00:00 2001 From: Yu Jiaoliang Date: Fri, 23 Aug 2024 09:55:41 +0800 Subject: [PATCH 161/168] mnt_idmapping: Use kmemdup_array instead of kmemdup for multiple allocation Let the kememdup_array() take care about multiplication and possible overflows. v2:Add a new modification for reverse array. Signed-off-by: Yu Jiaoliang Link: https://lore.kernel.org/r/20240823015542.3006262-1-yujiaoliang@vivo.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 3c60f1eaca61..79491663dbc0 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -228,15 +228,15 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, return 0; } - forward = kmemdup(map_from->forward, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + forward = kmemdup_array(map_from->forward, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; - reverse = kmemdup(map_from->reverse, - nr_extents * sizeof(struct uid_gid_extent), - GFP_KERNEL_ACCOUNT); + reverse = kmemdup_array(map_from->reverse, nr_extents, + sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; From e57de7d9e7fc1db16df63f6dab5e5d2f450b26f7 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 26 Aug 2024 13:55:03 +0800 Subject: [PATCH 162/168] fs/inode: Prevent dump_mapping() accessing invalid dentry.d_name.name It's observed that a crash occurs during hot-remove a memory device, in which user is accessing the hugetlb. See calltrace as following: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 14045 at arch/x86/mm/fault.c:1278 do_user_addr_fault+0x2a0/0x790 Modules linked in: kmem device_dax cxl_mem cxl_pmem cxl_port cxl_pci dax_hmem dax_pmem nd_pmem cxl_acpi nd_btt cxl_core crc32c_intel nvme virtiofs fuse nvme_core nfit libnvdimm dm_multipath scsi_dh_rdac scsi_dh_emc s mirror dm_region_hash dm_log dm_mod CPU: 1 PID: 14045 Comm: daxctl Not tainted 6.10.0-rc2-lizhijian+ #492 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:do_user_addr_fault+0x2a0/0x790 Code: 48 8b 00 a8 04 0f 84 b5 fe ff ff e9 1c ff ff ff 4c 89 e9 4c 89 e2 be 01 00 00 00 bf 02 00 00 00 e8 b5 ef 24 00 e9 42 fe ff ff <0f> 0b 48 83 c4 08 4c 89 ea 48 89 ee 4c 89 e7 5b 5d 41 5c 41 5d 41 RSP: 0000:ffffc90000a575f0 EFLAGS: 00010046 RAX: ffff88800c303600 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000001000 RSI: ffffffff82504162 RDI: ffffffff824b2c36 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90000a57658 R13: 0000000000001000 R14: ffff88800bc2e040 R15: 0000000000000000 FS: 00007f51cb57d880(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000001000 CR3: 00000000072e2004 CR4: 00000000001706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x8d/0x190 ? do_user_addr_fault+0x2a0/0x790 ? report_bug+0x1c3/0x1d0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x14/0x70 ? asm_exc_invalid_op+0x16/0x20 ? do_user_addr_fault+0x2a0/0x790 ? exc_page_fault+0x31/0x200 exc_page_fault+0x68/0x200 <...snip...> BUG: unable to handle page fault for address: 0000000000001000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 800000000ad92067 P4D 800000000ad92067 PUD 7677067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI ---[ end trace 0000000000000000 ]--- BUG: unable to handle page fault for address: 0000000000001000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 800000000ad92067 P4D 800000000ad92067 PUD 7677067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 14045 Comm: daxctl Kdump: loaded Tainted: G W 6.10.0-rc2-lizhijian+ #492 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 RIP: 0010:dentry_name+0x1f4/0x440 <...snip...> ? dentry_name+0x2fa/0x440 vsnprintf+0x1f3/0x4f0 vprintk_store+0x23a/0x540 vprintk_emit+0x6d/0x330 _printk+0x58/0x80 dump_mapping+0x10b/0x1a0 ? __pfx_free_object_rcu+0x10/0x10 __dump_page+0x26b/0x3e0 ? vprintk_emit+0xe0/0x330 ? _printk+0x58/0x80 ? dump_page+0x17/0x50 dump_page+0x17/0x50 do_migrate_range+0x2f7/0x7f0 ? do_migrate_range+0x42/0x7f0 ? offline_pages+0x2f4/0x8c0 offline_pages+0x60a/0x8c0 memory_subsys_offline+0x9f/0x1c0 ? lockdep_hardirqs_on+0x77/0x100 ? _raw_spin_unlock_irqrestore+0x38/0x60 device_offline+0xe3/0x110 state_store+0x6e/0xc0 kernfs_fop_write_iter+0x143/0x200 vfs_write+0x39f/0x560 ksys_write+0x65/0xf0 do_syscall_64+0x62/0x130 Previously, some sanity check have been done in dump_mapping() before the print facility parsing '%pd' though, it's still possible to run into an invalid dentry.d_name.name. Since dump_mapping() only needs to dump the filename only, retrieve it by itself in a safer way to prevent an unnecessary crash. Note that either retrieving the filename with '%pd' or strncpy_from_kernel_nofault(), the filename could be unreliable. Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20240826055503.1522320-1-lizhijian@fujitsu.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index aacd05749c1f..af78f515403f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -616,6 +616,7 @@ void dump_mapping(const struct address_space *mapping) struct hlist_node *dentry_first; struct dentry *dentry_ptr; struct dentry dentry; + char fname[64] = {}; unsigned long ino; /* @@ -652,11 +653,14 @@ void dump_mapping(const struct address_space *mapping) return; } + if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0) + strscpy(fname, ""); /* - * if dentry is corrupted, the %pd handler may still crash, - * but it's unlikely that we reach here with a corrupt mapping + * Even if strncpy_from_kernel_nofault() succeeded, + * the fname could be unreliable */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); + pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", + a_ops, ino, fname); } void clear_inode(struct inode *inode) From a24dfa51564256fddc85a834c45bf9e02e599472 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:45 +0200 Subject: [PATCH 163/168] block: remove checks for FALLOC_FL_NO_HIDE_STALE While the FALLOC_FL_NO_HIDE_STALE value has been registered, it has always been rejected by vfs_fallocate before making it into blkdev_fallocate because it isn't in the supported mask. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-2-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- block/fops.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/block/fops.c b/block/fops.c index 9825c1713a49..7f48f03a62e9 100644 --- a/block/fops.c +++ b/block/fops.c @@ -771,7 +771,7 @@ reexpand: #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + FALLOC_FL_ZERO_RANGE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) @@ -830,14 +830,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, len >> SECTOR_SHIFT, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); - if (error) - goto fail; - - error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, - len >> SECTOR_SHIFT, GFP_KERNEL); - break; default: error = -EOPNOTSUPP; } From c5a8e54233017cd9a77f8b11fb6a57d6d275e537 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:46 +0200 Subject: [PATCH 164/168] ext4: remove tracing for FALLOC_FL_NO_HIDE_STALE FALLOC_FL_NO_HIDE_STALE can't make it past vfs_fallocate (and if the flag does what the name implies that's a good thing as it would be highly dangerous). Remove the dead tracing code for it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-3-hch@lst.de Signed-off-by: Christian Brauner --- include/trace/events/ext4.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index cc5e9b7b2b44..156908641e68 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -91,7 +91,6 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ - { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) From 2f636906813976f40431cfb4a4af998c4ad4129c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:47 +0200 Subject: [PATCH 165/168] fs: sort out the fallocate mode vs flag mess The fallocate system call takes a mode argument, but that argument contains a wild mix of exclusive modes and an optional flags. Replace FALLOC_FL_SUPPORTED_MASK with FALLOC_FL_MODE_MASK, which excludes the optional flag bit, so that we can use switch statement on the value to easily enumerate the cases while getting the check for duplicate modes for free. To make this (and in the future the file system implementations) more readable also add a symbolic name for the 0 mode used to allocate blocks. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-4-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/open.c | 51 ++++++++++++++++++------------------- include/linux/falloc.h | 18 ++++++++----- include/uapi/linux/falloc.h | 1 + 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/fs/open.c b/fs/open.c index 22adbef7ecc2..daf1b55ca818 100644 --- a/fs/open.c +++ b/fs/open.c @@ -252,40 +252,39 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (offset < 0 || len <= 0) return -EINVAL; - /* Return error if mode is not supported */ - if (mode & ~FALLOC_FL_SUPPORTED_MASK) + if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE)) return -EOPNOTSUPP; - /* Punch hole and zero range are mutually exclusive */ - if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) == - (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) + /* + * Modes are exclusive, even if that is not obvious from the encoding + * as bit masks and the mix with the flag in the same namespace. + * + * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is + * encoded as no bit set. + */ + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_ALLOCATE_RANGE: + case FALLOC_FL_UNSHARE_RANGE: + case FALLOC_FL_ZERO_RANGE: + break; + case FALLOC_FL_PUNCH_HOLE: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + break; + case FALLOC_FL_COLLAPSE_RANGE: + case FALLOC_FL_INSERT_RANGE: + if (mode & FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; - - /* Punch hole must have keep size set */ - if ((mode & FALLOC_FL_PUNCH_HOLE) && - !(mode & FALLOC_FL_KEEP_SIZE)) - return -EOPNOTSUPP; - - /* Collapse range should only be used exclusively. */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (mode & ~FALLOC_FL_COLLAPSE_RANGE)) - return -EINVAL; - - /* Insert range should only be used exclusively. */ - if ((mode & FALLOC_FL_INSERT_RANGE) && - (mode & ~FALLOC_FL_INSERT_RANGE)) - return -EINVAL; - - /* Unshare range should only be used with allocate mode. */ - if ((mode & FALLOC_FL_UNSHARE_RANGE) && - (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) - return -EINVAL; + } if (!(file->f_mode & FMODE_WRITE)) return -EBADF; /* - * We can only allow pure fallocate on append only files + * On append-only files only space preallocation is supported. */ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; diff --git a/include/linux/falloc.h b/include/linux/falloc.h index f3f0b97b1675..3f49f3df6af5 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -25,12 +25,18 @@ struct space_resv { #define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) #define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) -#define FALLOC_FL_SUPPORTED_MASK (FALLOC_FL_KEEP_SIZE | \ - FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | \ - FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | \ - FALLOC_FL_UNSHARE_RANGE) +/* + * Mask of all supported fallocate modes. Only one can be set at a time. + * + * In addition to the mode bit, the mode argument can also encode flags. + * FALLOC_FL_KEEP_SIZE is the only supported flag so far. + */ +#define FALLOC_FL_MODE_MASK (FALLOC_FL_ALLOCATE_RANGE | \ + FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) /* on ia32 l_start is on a 32-bit boundary */ #if defined(CONFIG_X86_64) diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h index 51398fa57f6c..5810371ed72b 100644 --- a/include/uapi/linux/falloc.h +++ b/include/uapi/linux/falloc.h @@ -2,6 +2,7 @@ #ifndef _UAPI_FALLOC_H_ #define _UAPI_FALLOC_H_ +#define FALLOC_FL_ALLOCATE_RANGE 0x00 /* allocate range */ #define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */ #define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */ #define FALLOC_FL_NO_HIDE_STALE 0x04 /* reserved codepoint */ From 2764727be269d7dd906f782455f7c5184193ea5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:48 +0200 Subject: [PATCH 166/168] xfs: call xfs_flush_unmap_range from xfs_free_file_space Call xfs_flush_unmap_range from xfs_free_file_space so that xfs_file_fallocate doesn't have to predict which mode will call it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-5-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_bmap_util.c | 8 ++++++++ fs/xfs/xfs_file.c | 21 --------------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fe2e2c930975..187a0dbda24f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -848,6 +848,14 @@ xfs_free_file_space( if (len <= 0) /* if nothing being freed */ return 0; + /* + * Now AIO and DIO has drained we flush and (if necessary) invalidate + * the cached range over the first operation we are about to run. + */ + error = xfs_flush_unmap_range(ip, offset, len); + if (error) + return error; + startoffset_fsb = XFS_B_TO_FSB(mp, offset); endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4cdc54dc9686..5b9e49da0601 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -890,27 +890,6 @@ xfs_file_fallocate( */ inode_dio_wait(inode); - /* - * Now AIO and DIO has drained we flush and (if necessary) invalidate - * the cached range over the first operation we are about to run. - * - * We care about zero and collapse here because they both run a hole - * punch over the range first. Because that can zero data, and the range - * of invalidation for the shift operations is much larger, we still do - * the required flush for collapse in xfs_prepare_shift(). - * - * Insert has the same range requirements as collapse, and we extend the - * file first which can zero data. Hence insert has the same - * flush/invalidate requirements as collapse and so they are both - * handled at the right time by xfs_prepare_shift(). - */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | - FALLOC_FL_COLLAPSE_RANGE)) { - error = xfs_flush_unmap_range(ip, offset, len); - if (error) - goto out_unlock; - } - error = file_modified(file); if (error) goto out_unlock; From 12206b1c423b64e2d1e3633deec069315a928ee2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:49 +0200 Subject: [PATCH 167/168] xfs: move the xfs_is_always_cow_inode check into xfs_alloc_file_space Move the xfs_is_always_cow_inode check from the caller into xfs_alloc_file_space to prepare for refactoring of xfs_file_fallocate. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-6-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_bmap_util.c | 3 +++ fs/xfs/xfs_file.c | 8 +++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 187a0dbda24f..e9fdebaa40ea 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -653,6 +653,9 @@ xfs_alloc_file_space( xfs_bmbt_irec_t imaps[1], *imapp; int error; + if (xfs_is_always_cow_inode(ip)) + return 0; + trace_xfs_alloc_file_space(ip); if (xfs_is_shutdown(mp)) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5b9e49da0601..489bc1b173c2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -987,11 +987,9 @@ xfs_file_fallocate( } } - if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len); - if (error) - goto out_unlock; - } + error = xfs_alloc_file_space(ip, offset, len); + if (error) + goto out_unlock; } /* Change file size if needed */ From a0c3802f87a2637aba028f5f7030b3d52cec2727 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:50 +0200 Subject: [PATCH 168/168] xfs: refactor xfs_file_fallocate Refactor xfs_file_fallocate into separate helpers for each mode, two factors for i_size handling and a single switch statement over the supported modes. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-7-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xfs/xfs_file.c | 330 +++++++++++++++++++++++++++++----------------- 1 file changed, 208 insertions(+), 122 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 489bc1b173c2..f6e4912769a0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -852,6 +852,192 @@ static inline bool xfs_file_sync_writes(struct file *filp) return false; } +static int +xfs_falloc_newsize( + struct file *file, + int mode, + loff_t offset, + loff_t len, + loff_t *new_size) +{ + struct inode *inode = file_inode(file); + + if ((mode & FALLOC_FL_KEEP_SIZE) || offset + len <= i_size_read(inode)) + return 0; + *new_size = offset + len; + return inode_newsize_ok(inode, *new_size); +} + +static int +xfs_falloc_setsize( + struct file *file, + loff_t new_size) +{ + struct iattr iattr = { + .ia_valid = ATTR_SIZE, + .ia_size = new_size, + }; + + if (!new_size) + return 0; + return xfs_vn_setattr_size(file_mnt_idmap(file), file_dentry(file), + &iattr); +} + +static int +xfs_falloc_collapse_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = i_size_read(inode) - len; + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * There is no need to overlap collapse range with EOF, in which case it + * is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + error = xfs_collapse_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_insert_range( + struct file *file, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t isize = i_size_read(inode); + int error; + + if (!xfs_is_falloc_aligned(XFS_I(inode), offset, len)) + return -EINVAL; + + /* + * New inode size must not exceed ->s_maxbytes, accounting for + * possible signed overflow. + */ + if (inode->i_sb->s_maxbytes - isize < len) + return -EFBIG; + + /* Offset should be less than i_size */ + if (offset >= isize) + return -EINVAL; + + error = xfs_falloc_setsize(file, isize + len); + if (error) + return error; + + /* + * Perform hole insertion now that the file size has been updated so + * that if we crash during the operation we don't leave shifted extents + * past EOF and hence losing access to the data that is contained within + * them. + */ + return xfs_insert_file_space(XFS_I(inode), offset, len); +} + +/* + * Punch a hole and prealloc the range. We use a hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued by + * virtue of the hole punch. + */ +static int +xfs_falloc_zero_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + unsigned int blksize = i_blocksize(inode); + loff_t new_size = 0; + int error; + + trace_xfs_zero_file_space(XFS_I(inode)); + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_free_file_space(XFS_I(inode), offset, len); + if (error) + return error; + + len = round_up(offset + len, blksize) - round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_unshare_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_reflink_unshare(XFS_I(inode), offset, len); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + +static int +xfs_falloc_allocate_range( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + loff_t new_size = 0; + int error; + + /* + * If always_cow mode we can't use preallocations and thus should not + * create them. + */ + if (xfs_is_always_cow_inode(XFS_I(inode))) + return -EOPNOTSUPP; + + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); + if (error) + return error; + + error = xfs_alloc_file_space(XFS_I(inode), offset, len); + if (error) + return error; + return xfs_falloc_setsize(file, new_size); +} + #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ @@ -868,8 +1054,6 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - loff_t new_size = 0; - bool do_file_insert = false; if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -894,129 +1078,31 @@ xfs_file_fallocate( if (error) goto out_unlock; - if (mode & FALLOC_FL_PUNCH_HOLE) { + switch (mode & FALLOC_FL_MODE_MASK) { + case FALLOC_FL_PUNCH_HOLE: error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * There is no need to overlap collapse range with EOF, - * in which case it is effectively a truncate operation - */ - if (offset + len >= i_size_read(inode)) { - error = -EINVAL; - goto out_unlock; - } - - new_size = i_size_read(inode) - len; - - error = xfs_collapse_file_space(ip, offset, len); - if (error) - goto out_unlock; - } else if (mode & FALLOC_FL_INSERT_RANGE) { - loff_t isize = i_size_read(inode); - - if (!xfs_is_falloc_aligned(ip, offset, len)) { - error = -EINVAL; - goto out_unlock; - } - - /* - * New inode size must not exceed ->s_maxbytes, accounting for - * possible signed overflow. - */ - if (inode->i_sb->s_maxbytes - isize < len) { - error = -EFBIG; - goto out_unlock; - } - new_size = isize + len; - - /* Offset should be less than i_size */ - if (offset >= isize) { - error = -EINVAL; - goto out_unlock; - } - do_file_insert = true; - } else { - if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { - new_size = offset + len; - error = inode_newsize_ok(inode, new_size); - if (error) - goto out_unlock; - } - - if (mode & FALLOC_FL_ZERO_RANGE) { - /* - * Punch a hole and prealloc the range. We use a hole - * punch rather than unwritten extent conversion for two - * reasons: - * - * 1.) Hole punch handles partial block zeroing for us. - * 2.) If prealloc returns ENOSPC, the file range is - * still zero-valued by virtue of the hole punch. - */ - unsigned int blksize = i_blocksize(inode); - - trace_xfs_zero_file_space(ip); - - error = xfs_free_file_space(ip, offset, len); - if (error) - goto out_unlock; - - len = round_up(offset + len, blksize) - - round_down(offset, blksize); - offset = round_down(offset, blksize); - } else if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; - } else { - /* - * If always_cow mode we can't use preallocations and - * thus should not create them. - */ - if (xfs_is_always_cow_inode(ip)) { - error = -EOPNOTSUPP; - goto out_unlock; - } - } - - error = xfs_alloc_file_space(ip, offset, len); - if (error) - goto out_unlock; + break; + case FALLOC_FL_COLLAPSE_RANGE: + error = xfs_falloc_collapse_range(file, offset, len); + break; + case FALLOC_FL_INSERT_RANGE: + error = xfs_falloc_insert_range(file, offset, len); + break; + case FALLOC_FL_ZERO_RANGE: + error = xfs_falloc_zero_range(file, mode, offset, len); + break; + case FALLOC_FL_UNSHARE_RANGE: + error = xfs_falloc_unshare_range(file, mode, offset, len); + break; + case FALLOC_FL_ALLOCATE_RANGE: + error = xfs_falloc_allocate_range(file, mode, offset, len); + break; + default: + error = -EOPNOTSUPP; + break; } - /* Change file size if needed */ - if (new_size) { - struct iattr iattr; - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_mnt_idmap(file), - file_dentry(file), &iattr); - if (error) - goto out_unlock; - } - - /* - * Perform hole insertion now that the file size has been - * updated so that if we crash during the operation we don't - * leave shifted extents past EOF and hence losing access to - * the data that is contained within them. - */ - if (do_file_insert) { - error = xfs_insert_file_space(ip, offset, len); - if (error) - goto out_unlock; - } - - if (xfs_file_sync_writes(file)) + if (!error && xfs_file_sync_writes(file)) error = xfs_log_force_inode(ip); out_unlock: