linux/fs/fuse/file.c

3257 lines
80 KiB
C
Raw Normal View History

/*
FUSE: Filesystem in Userspace
Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
*/
#include "fuse_i.h"
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/filelock.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
struct fuse_open_out *outargp)
{
struct fuse_open_in inarg;
FUSE_ARGS(args);
memset(&inarg, 0, sizeof(inarg));
inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
if (!fm->fc->atomic_o_trunc)
inarg.flags &= ~O_TRUNC;
if (fm->fc->handle_killpriv_v2 &&
(inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
args.opcode = opcode;
args.nodeid = nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(*outargp);
args.out_args[0].value = outargp;
return fuse_simple_request(fm, &args);
}
struct fuse_release_args {
struct fuse_args args;
struct fuse_release_in inarg;
struct inode *inode;
};
struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
{
struct fuse_file *ff;
ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
if (unlikely(!ff))
return NULL;
ff->fm = fm;
ff->release_args = kzalloc(sizeof(*ff->release_args),
GFP_KERNEL_ACCOUNT);
if (!ff->release_args) {
kfree(ff);
return NULL;
}
INIT_LIST_HEAD(&ff->write_entry);
mutex_init(&ff->readdir.lock);
refcount_set(&ff->count, 1);
RB_CLEAR_NODE(&ff->polled_node);
init_waitqueue_head(&ff->poll_wait);
ff->kh = atomic64_inc_return(&fm->fc->khctr);
return ff;
}
void fuse_file_free(struct fuse_file *ff)
{
kfree(ff->release_args);
mutex_destroy(&ff->readdir.lock);
kfree(ff);
}
static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
refcount_inc(&ff->count);
return ff;
}
static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
iput(ra->inode);
kfree(ra);
}
static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
if (refcount_dec_and_test(&ff->count)) {
struct fuse_args *args = &ff->release_args->args;
if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
/* Do nothing when client does not implement 'open' */
fuse_release_end(ff->fm, args, 0);
} else if (sync) {
fuse_simple_request(ff->fm, args);
fuse_release_end(ff->fm, args, 0);
} else {
args->end = fuse_release_end;
if (fuse_simple_background(ff->fm, args,
GFP_KERNEL | __GFP_NOFAIL))
fuse_release_end(ff->fm, args, -ENOTCONN);
}
kfree(ff);
}
}
struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, bool isdir)
{
struct fuse_conn *fc = fm->fc;
struct fuse_file *ff;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
ff = fuse_file_alloc(fm);
if (!ff)
return ERR_PTR(-ENOMEM);
ff->fh = 0;
/* Default for no-open */
ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
if (isdir ? !fc->no_opendir : !fc->no_open) {
struct fuse_open_out outarg;
int err;
err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
if (!err) {
ff->fh = outarg.fh;
ff->open_flags = outarg.open_flags;
} else if (err != -ENOSYS) {
fuse_file_free(ff);
return ERR_PTR(err);
} else {
if (isdir)
fc->no_opendir = 1;
else
fc->no_open = 1;
}
}
if (isdir)
ff->open_flags &= ~FOPEN_DIRECT_IO;
ff->nodeid = nodeid;
return ff;
}
int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
bool isdir)
{
struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir);
if (!IS_ERR(ff))
file->private_data = ff;
return PTR_ERR_OR_ZERO(ff);
}
EXPORT_SYMBOL_GPL(fuse_do_open);
static void fuse_link_write_file(struct file *file)
{
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff = file->private_data;
/*
* file may be written through mmap, so chain it onto the
* inodes's write_file list
*/
spin_lock(&fi->lock);
if (list_empty(&ff->write_entry))
list_add(&ff->write_entry, &fi->write_files);
spin_unlock(&fi->lock);
}
void fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = get_fuse_conn(inode);
fuse: Add FOPEN_STREAM to use stream_open() Starting from commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") files opened even via nonseekable_open gate read and write via lock and do not allow them to be run simultaneously. This can create read vs write deadlock if a filesystem is trying to implement a socket-like file which is intended to be simultaneously used for both read and write from filesystem client. See commit 10dce8af3422 ("fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock") for details and e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") for a similar deadlock example on /proc/xen/xenbus. To avoid such deadlock it was tempting to adjust fuse_finish_open to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. Add another flag (FOPEN_STREAM) for filesystem servers to indicate that the opened handler is having stream-like semantics; does not use file position and thus the kernel is free to issue simultaneous read and write request on opened file handle. This patch together with stream_open() should be added to stable kernels starting from v3.14+. This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in open handler and this way avoid the deadlock on all kernel versions. This should work because fuse_finish_open ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Kirill Smelkov <kirr@nexedi.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2019-04-24 15:13:57 +08:00
if (ff->open_flags & FOPEN_STREAM)
stream_open(inode, file);
else if (ff->open_flags & FOPEN_NONSEEKABLE)
nonseekable_open(inode, file);
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
file_update_time(file);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
}
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_conn *fc = fm->fc;
int err;
bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
bool dax_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc && FUSE_IS_DAX(inode);
if (fuse_is_bad(inode))
return -EIO;
err = generic_file_open(inode, file);
if (err)
return err;
fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: <stable@vger.kernel.org> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 21:48:53 +08:00
if (is_wb_truncate || dax_truncate)
inode_lock(inode);
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: <stable@vger.kernel.org> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 21:48:53 +08:00
goto out_inode_unlock;
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
}
fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: <stable@vger.kernel.org> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 21:48:53 +08:00
if (is_wb_truncate || dax_truncate)
fuse_set_nowrite(inode);
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: <stable@vger.kernel.org> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 21:48:53 +08:00
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
struct fuse_file *ff = file->private_data;
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
}
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
if (dax_truncate)
filemap_invalidate_unlock(inode->i_mapping);
fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com> Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: <stable@vger.kernel.org> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-04-22 21:48:53 +08:00
out_inode_unlock:
if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
return err;
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags, int opcode)
{
struct fuse_conn *fc = ff->fm->fc;
struct fuse_release_args *ra = ff->release_args;
/* Inode is NULL on error path of fuse_create_open() */
if (likely(fi)) {
spin_lock(&fi->lock);
list_del(&ff->write_entry);
spin_unlock(&fi->lock);
}
spin_lock(&fc->lock);
if (!RB_EMPTY_NODE(&ff->polled_node))
rb_erase(&ff->polled_node, &fc->polled_files);
spin_unlock(&fc->lock);
wake_up_interruptible_all(&ff->poll_wait);
ra->inarg.fh = ff->fh;
ra->inarg.flags = flags;
ra->args.in_numargs = 1;
ra->args.in_args[0].size = sizeof(struct fuse_release_in);
ra->args.in_args[0].value = &ra->inarg;
ra->args.opcode = opcode;
ra->args.nodeid = ff->nodeid;
ra->args.force = true;
ra->args.nocreds = true;
}
void fuse_file_release(struct inode *inode, struct fuse_file *ff,
unsigned int open_flags, fl_owner_t id, bool isdir)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_release_args *ra = ff->release_args;
int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
fuse_prepare_release(fi, ff, open_flags, opcode);
if (ff->flock) {
ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
}
/* Hold inode until release is finished */
ra->inode = igrab(inode);
/*
* Normally this will send the RELEASE request, however if
* some asynchronous READ or WRITE requests are outstanding,
* the sending will be delayed.
*
* Make the release synchronous if this is a fuseblk mount,
* synchronous RELEASE is allowed (and desirable) in this case
* because the server can be trusted not to screw up.
*/
fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}
void fuse_release_common(struct file *file, bool isdir)
{
fuse_file_release(file_inode(file), file->private_data, file->f_flags,
(fl_owner_t) file, isdir);
}
static int fuse_open(struct inode *inode, struct file *file)
{
return fuse_open_common(inode, file, false);
}
static int fuse_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
/*
* Dirty pages might remain despite write_inode_now() call from
* fuse_flush() due to writes racing with the close.
*/
if (fc->writeback_cache)
write_inode_now(inode, 1);
fuse_release_common(file, false);
/* return value is ignored by VFS */
return 0;
}
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
unsigned int flags)
{
WARN_ON(refcount_read(&ff->count) > 1);
fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
/*
* iput(NULL) is a no-op and since the refcount is 1 and everything's
* synchronous, we are fine with not doing igrab() here"
*/
fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);
/*
* Scramble the ID space with XTEA, so that the value of the files_struct
* pointer is not exposed to userspace.
*/
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
u32 *k = fc->scramble_key;
u64 v = (unsigned long) id;
u32 v0 = v;
u32 v1 = v >> 32;
u32 sum = 0;
int i;
for (i = 0; i < 32; i++) {
v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
sum += 0x9E3779B9;
v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
}
return (u64) v0 + ((u64) v1 << 32);
}
struct fuse_writepage_args {
struct fuse_io_args ia;
struct rb_node writepages_entry;
struct list_head queue_entry;
struct fuse_writepage_args *next;
struct inode *inode;
struct fuse_sync_bucket *bucket;
};
static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
pgoff_t idx_from, pgoff_t idx_to)
{
struct rb_node *n;
n = fi->writepages.rb_node;
while (n) {
struct fuse_writepage_args *wpa;
pgoff_t curr_index;
wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
WARN_ON(get_fuse_inode(wpa->inode) != fi);
curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
if (idx_from >= curr_index + wpa->ia.ap.num_pages)
n = n->rb_right;
else if (idx_to < curr_index)
n = n->rb_left;
else
return wpa;
}
return NULL;
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
/*
* Check if any page in a range is under writeback
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
*
* This is currently done by walking the list of writepage requests
* for the inode, which can be pretty inefficient.
*/
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
pgoff_t idx_to)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct fuse_inode *fi = get_fuse_inode(inode);
bool found;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
spin_lock(&fi->lock);
found = fuse_find_writeback(fi, idx_from, idx_to);
spin_unlock(&fi->lock);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
return found;
}
static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
return fuse_range_is_writeback(inode, index, index);
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
/*
* Wait for page writeback to be completed.
*
* Since fuse doesn't rely on the VM writeback tracking, this has to
* use some other means.
*/
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct fuse_inode *fi = get_fuse_inode(inode);
wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}
/*
* Wait for all pending writepages on the inode to finish.
*
* This is currently done by blocking further writes with FUSE_NOWRITE
* and waiting for all sent writes to complete.
*
* This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
* could conflict with truncation.
*/
static void fuse_sync_writes(struct inode *inode)
{
fuse_set_nowrite(inode);
fuse_release_nowrite(inode);
}
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
struct fuse_flush_in inarg;
FUSE_ARGS(args);
int err;
if (fuse_is_bad(inode))
return -EIO;
if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
return 0;
err = write_inode_now(inode, 1);
if (err)
return err;
inode_lock(inode);
fuse_sync_writes(inode);
inode_unlock(inode);
err = filemap_check_errors(file->f_mapping);
if (err)
return err;
err = 0;
if (fm->fc->no_flush)
goto inval_attr_out;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
args.opcode = FUSE_FLUSH;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.force = true;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_flush = 1;
err = 0;
}
inval_attr_out:
/*
* In memory i_blocks is not maintained by fuse, if writeback cache is
* enabled, i_blocks from cached attr may not be accurate.
*/
if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
return err;
}
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int opcode)
{
struct inode *inode = file->f_mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_fsync_in inarg;
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
args.opcode = opcode;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
return fuse_simple_request(fm, &args);
}
static int fuse_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct inode *inode = file->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
if (fuse_is_bad(inode))
return -EIO;
inode_lock(inode);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
/*
* Start writeback against all dirty pages of the inode, then
* wait for all outstanding writes, before sending the FSYNC
* request.
*/
err = file_write_and_wait_range(file, start, end);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
if (err)
goto out;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
fuse_sync_writes(inode);
/*
* Due to implementation of fuse writeback
* file_write_and_wait_range() does not catch errors.
* We have to do this directly after fuse_sync_writes()
*/
err = file_check_and_advance_wb_err(file);
if (err)
goto out;
err = sync_inode_metadata(inode, 1);
if (err)
goto out;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
if (fc->no_fsync)
goto out;
err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
if (err == -ENOSYS) {
fc->no_fsync = 1;
err = 0;
}
out:
inode_unlock(inode);
return err;
}
void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
size_t count, int opcode)
{
struct fuse_file *ff = file->private_data;
struct fuse_args *args = &ia->ap.args;
ia->read.in.fh = ff->fh;
ia->read.in.offset = pos;
ia->read.in.size = count;
ia->read.in.flags = file->f_flags;
args->opcode = opcode;
args->nodeid = ff->nodeid;
args->in_numargs = 1;
args->in_args[0].size = sizeof(ia->read.in);
args->in_args[0].value = &ia->read.in;
args->out_argvar = true;
args->out_numargs = 1;
args->out_args[0].size = count;
}
static void fuse_release_user_pages(struct fuse_args_pages *ap,
bool should_dirty)
{
unsigned int i;
for (i = 0; i < ap->num_pages; i++) {
if (should_dirty)
set_page_dirty_lock(ap->pages[i]);
put_page(ap->pages[i]);
}
}
static void fuse_io_release(struct kref *kref)
{
kfree(container_of(kref, struct fuse_io_priv, refcnt));
}
static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
if (io->err)
return io->err;
if (io->bytes >= 0 && io->write)
return -EIO;
return io->bytes < 0 ? io->size : io->bytes;
}
/*
* In case of short read, the caller sets 'pos' to the position of
* actual end of fuse request in IO request. Otherwise, if bytes_requested
* == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
*
* An example:
* User requested DIO read of 64K. It was split into two 32K fuse requests,
* both submitted asynchronously. The first of them was ACKed by userspace as
* fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
* second request was ACKed as short, e.g. only 1K was read, resulting in
* pos == 33K.
*
* Thus, when all fuse requests are completed, the minimal non-negative 'pos'
* will be equal to the length of the longest contiguous fragment of
* transferred data starting from the beginning of IO request.
*/
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
int left;
spin_lock(&io->lock);
if (err)
io->err = io->err ? : err;
else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
io->bytes = pos;
left = --io->reqs;
if (!left && io->blocking)
complete(io->done);
spin_unlock(&io->lock);
if (!left && !io->blocking) {
ssize_t res = fuse_get_res_by_io(io);
if (res >= 0) {
struct inode *inode = file_inode(io->iocb->ki_filp);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
spin_unlock(&fi->lock);
}
io->iocb->ki_complete(io->iocb, res);
}
kref_put(&io->refcnt, fuse_io_release);
}
static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
unsigned int npages)
{
struct fuse_io_args *ia;
ia = kzalloc(sizeof(*ia), GFP_KERNEL);
if (ia) {
ia->io = io;
ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
&ia->ap.descs);
if (!ia->ap.pages) {
kfree(ia);
ia = NULL;
}
}
return ia;
}
static void fuse_io_free(struct fuse_io_args *ia)
{
kfree(ia->ap.pages);
kfree(ia);
}
static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_io_priv *io = ia->io;
ssize_t pos = -1;
fuse_release_user_pages(&ia->ap, io->should_dirty);
if (err) {
/* Nothing */
} else if (io->write) {
if (ia->write.out.size > ia->write.in.size) {
err = -EIO;
} else if (ia->write.in.size != ia->write.out.size) {
pos = ia->write.in.offset - io->offset +
ia->write.out.size;
}
} else {
u32 outsize = args->out_args[0].size;
if (ia->read.in.size != outsize)
pos = ia->read.in.offset - io->offset + outsize;
}
fuse_aio_complete(io, err, pos);
fuse_io_free(ia);
}
static ssize_t fuse_async_req_send(struct fuse_mount *fm,
struct fuse_io_args *ia, size_t num_bytes)
{
ssize_t err;
struct fuse_io_priv *io = ia->io;
spin_lock(&io->lock);
kref_get(&io->refcnt);
io->size += num_bytes;
io->reqs++;
spin_unlock(&io->lock);
ia->ap.args.end = fuse_aio_complete_req;
ia->ap.args.may_block = io->should_dirty;
err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL);
if (err)
fuse_aio_complete_req(fm, &ia->ap.args, err);
return num_bytes;
}
static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
fl_owner_t owner)
{
struct file *file = ia->io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
if (owner != NULL) {
ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
return fuse_async_req_send(fm, ia, count);
return fuse_simple_request(fm, &ia->ap.args);
}
static void fuse_read_update_size(struct inode *inode, loff_t size,
u64 attr_ver)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
spin_lock(&fi->lock);
if (attr_ver >= fi->attr_version && size < inode->i_size &&
fuse: hotfix truncate_pagecache() issue The way how fuse calls truncate_pagecache() from fuse_change_attributes() is completely wrong. Because, w/o i_mutex held, we never sure whether 'oldsize' and 'attr->size' are valid by the time of execution of truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we released fc->lock in the middle of fuse_change_attributes(), we completely loose control of actions which may happen with given inode until we reach truncate_pagecache. The list of potentially dangerous actions includes mmap-ed reads and writes, ftruncate(2) and write(2) extending file size. The typical outcome of doing truncate_pagecache() with outdated arguments is data corruption from user point of view. This is (in some sense) acceptable in cases when the issue is triggered by a change of the file on the server (i.e. externally wrt fuse operation), but it is absolutely intolerable in scenarios when a single fuse client modifies a file without any external intervention. A real life case I discovered by fsx-linux looked like this: 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends FUSE_SETATTR to the server synchronously, but before getting fc->lock ... 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP to the server synchronously, then calls fuse_change_attributes(). The latter updates i_size, releases fc->lock, but before comparing oldsize vs attr->size.. 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and updating attributes and i_size, but now oldsize is equal to outarg.attr.size because i_size has just been updated (step 2). Hence, fuse_do_setattr() returns w/o calling truncate_pagecache(). 4. As soon as ftruncate(2) completes, the user extends file size by write(2) making a hole in the middle of file, then reads data from the hole either by read(2) or mmap-ed read. The user expects to get zero data from the hole, but gets stale data because truncate_pagecache() is not executed yet. The scenario above illustrates one side of the problem: not truncating the page cache even though we should. Another side corresponds to truncating page cache too late, when the state of inode changed significantly. Theoretically, the following is possible: 1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size changed (due to our own fuse_do_setattr()) and is going to call truncate_pagecache() for some 'new_size' it believes valid right now. But by the time that particular truncate_pagecache() is called ... 2. fuse_do_setattr() returns (either having called truncate_pagecache() or not -- it doesn't matter). 3. The file is extended either by write(2) or ftruncate(2) or fallocate(2). 4. mmap-ed write makes a page in the extended region dirty. The result will be the lost of data user wrote on the fourth step. The patch is a hotfix resolving the issue in a simplistic way: let's skip dangerous i_size update and truncate_pagecache if an operation changing file size is in progress. This simplistic approach looks correct for the cases w/o external changes. And to handle them properly, more sophisticated and intrusive techniques (e.g. NFS-like one) would be required. I'd like to postpone it until the issue is well discussed on the mailing list(s). Changed in v2: - improved patch description to cover both sides of the issue. Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: stable@vger.kernel.org
2013-08-30 21:06:04 +08:00
!test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, size);
}
spin_unlock(&fi->lock);
}
static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
struct fuse_args_pages *ap)
{
struct fuse_conn *fc = get_fuse_conn(inode);
/*
* If writeback_cache is enabled, a short read means there's a hole in
* the file. Some data after the hole is in page cache, but has not
* reached the client fs yet. So the hole is not present there.
*/
if (!fc->writeback_cache) {
loff_t pos = page_offset(ap->pages[0]) + num_read;
fuse_read_update_size(inode, pos, attr_ver);
}
}
static int fuse_do_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
loff_t pos = page_offset(page);
struct fuse_page_desc desc = { .length = PAGE_SIZE };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
.ap.num_pages = 1,
.ap.pages = &page,
.ap.descs = &desc,
};
ssize_t res;
u64 attr_ver;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
/*
* Page writeback can extend beyond the lifetime of the
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
* page-cache page, so make sure we read a properly synced
* page.
*/
fuse_wait_on_page_writeback(inode, page->index);
attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
if (pos + (desc.length - 1) == LLONG_MAX)
desc.length--;
fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
res = fuse_simple_request(fm, &ia.ap.args);
if (res < 0)
return res;
/*
* Short read means EOF. If file size is larger, truncate it
*/
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
SetPageUptodate(page);
return 0;
}
static int fuse_read_folio(struct file *file, struct folio *folio)
{
struct page *page = &folio->page;
struct inode *inode = page->mapping->host;
int err;
err = -EIO;
if (fuse_is_bad(inode))
goto out;
err = fuse_do_readpage(file, page);
fuse_invalidate_atime(inode);
out:
unlock_page(page);
return err;
}
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
int i;
struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
struct fuse_args_pages *ap = &ia->ap;
size_t count = ia->read.in.size;
size_t num_read = args->out_args[0].size;
struct address_space *mapping = NULL;
for (i = 0; mapping == NULL && i < ap->num_pages; i++)
mapping = ap->pages[i]->mapping;
if (mapping) {
struct inode *inode = mapping->host;
/*
* Short read means EOF. If file size is larger, truncate it
*/
if (!err && num_read < count)
fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
fuse_invalidate_atime(inode);
}
for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i];
if (!err)
SetPageUptodate(page);
else
SetPageError(page);
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
}
if (ia->ff)
fuse_file_put(ia->ff, false, false);
fuse_io_free(ia);
}
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = page_offset(ap->pages[0]);
size_t count = ap->num_pages << PAGE_SHIFT;
ssize_t res;
int err;
ap->args.out_pages = true;
ap->args.page_zeroing = true;
ap->args.page_replace = true;
/* Don't overflow end offset */
if (pos + (count - 1) == LLONG_MAX) {
count--;
ap->descs[ap->num_pages - 1].length--;
}
WARN_ON((loff_t) (pos + count) < 0);
fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
ia->read.attr_ver = fuse_get_attr_version(fm->fc);
if (fm->fc->async_read) {
ia->ff = fuse_file_get(ff);
ap->args.end = fuse_readpages_end;
err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
if (!err)
return;
} else {
res = fuse_simple_request(fm, &ap->args);
err = res < 0 ? res : 0;
}
fuse_readpages_end(fm, &ap->args, err);
}
static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
unsigned int i, max_pages, nr_pages = 0;
if (fuse_is_bad(inode))
return;
max_pages = min_t(unsigned int, fc->max_pages,
fc->max_read / PAGE_SIZE);
for (;;) {
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown <neilb@suse.de> Cc: Anna Schumaker <Anna.Schumaker@Netapp.com> Cc: Chao Yu <chao@kernel.org> Cc: Darrick J. Wong <djwong@kernel.org> Cc: Ilya Dryomov <idryomov@gmail.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Lars Ellenberg <lars.ellenberg@linbit.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Paolo Valente <paolo.valente@linaro.org> Cc: Philipp Reisner <philipp.reisner@linbit.com> Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-23 05:38:58 +08:00
if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac))
/*
* Congested and only async pages left, so skip the
* rest.
*/
break;
nr_pages = readahead_count(rac) - nr_pages;
if (nr_pages > max_pages)
nr_pages = max_pages;
if (nr_pages == 0)
break;
ia = fuse_io_alloc(NULL, nr_pages);
if (!ia)
return;
ap = &ia->ap;
nr_pages = __readahead_batch(rac, ap->pages, nr_pages);
for (i = 0; i < nr_pages; i++) {
fuse_wait_on_page_writeback(inode,
readahead_index(rac) + i);
ap->descs[i].length = PAGE_SIZE;
}
ap->num_pages = nr_pages;
fuse_send_readpages(ia, rac->file);
}
}
static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
/*
* In auto invalidate mode, always update attributes on read.
* Otherwise, only update if we attempt to read past EOF (to ensure
* i_size is up to date).
*/
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE);
if (err)
return err;
}
return generic_file_read_iter(iocb, to);
}
static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
loff_t pos, size_t count)
{
struct fuse_args *args = &ia->ap.args;
ia->write.in.fh = ff->fh;
ia->write.in.offset = pos;
ia->write.in.size = count;
args->opcode = FUSE_WRITE;
args->nodeid = ff->nodeid;
args->in_numargs = 2;
if (ff->fm->fc->minor < 9)
args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
else
args->in_args[0].size = sizeof(ia->write.in);
args->in_args[0].value = &ia->write.in;
args->in_args[1].size = count;
args->out_numargs = 1;
args->out_args[0].size = sizeof(ia->write.out);
args->out_args[0].value = &ia->write.out;
}
static unsigned int fuse_write_flags(struct kiocb *iocb)
{
unsigned int flags = iocb->ki_filp->f_flags;
if (iocb_is_dsync(iocb))
flags |= O_DSYNC;
if (iocb->ki_flags & IOCB_SYNC)
flags |= O_SYNC;
return flags;
}
static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
size_t count, fl_owner_t owner)
{
struct kiocb *iocb = ia->io->iocb;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_write_in *inarg = &ia->write.in;
ssize_t err;
fuse_write_args_fill(ia, ff, pos, count);
inarg->flags = fuse_write_flags(iocb);
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner);
}
if (ia->io->async)
return fuse_async_req_send(fm, ia, count);
err = fuse_simple_request(fm, &ia->ap.args);
if (!err && ia->write.out.size > count)
err = -EIO;
return err ?: ia->write.out.size;
}
bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
bool ret = false;
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
if (written > 0 && pos > inode->i_size) {
i_size_write(inode, pos);
ret = true;
}
spin_unlock(&fi->lock);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
return ret;
}
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
struct kiocb *iocb, struct inode *inode,
loff_t pos, size_t count)
{
struct fuse_args_pages *ap = &ia->ap;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
unsigned int offset, i;
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
bool short_write;
int err;
for (i = 0; i < ap->num_pages; i++)
fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID))
ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
err = fuse_simple_request(fm, &ap->args);
if (!err && ia->write.out.size > count)
err = -EIO;
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
short_write = ia->write.out.size < count;
offset = ap->descs[0].offset;
count = ia->write.out.size;
for (i = 0; i < ap->num_pages; i++) {
struct page *page = ap->pages[i];
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
if (err) {
ClearPageUptodate(page);
} else {
if (count >= PAGE_SIZE - offset)
count -= PAGE_SIZE - offset;
else {
if (short_write)
ClearPageUptodate(page);
count = 0;
}
offset = 0;
}
if (ia->write.page_locked && (i == ap->num_pages - 1))
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
}
return err;
}
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
unsigned int max_pages)
{
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
unsigned offset = pos & (PAGE_SIZE - 1);
size_t count = 0;
int err;
ap->args.in_pages = true;
ap->descs[0].offset = offset;
do {
size_t tmp;
struct page *page;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
pgoff_t index = pos >> PAGE_SHIFT;
size_t bytes = min_t(size_t, PAGE_SIZE - offset,
iov_iter_count(ii));
bytes = min_t(size_t, bytes, fc->max_write - count);
again:
err = -EFAULT;
if (fault_in_iov_iter_readable(ii, bytes))
break;
err = -ENOMEM;
page = grab_cache_page_write_begin(mapping, index);
if (!page)
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
tmp = copy_page_from_iter_atomic(page, offset, bytes, ii);
flush_dcache_page(page);
if (!tmp) {
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
goto again;
}
err = 0;
ap->pages[ap->num_pages] = page;
ap->descs[ap->num_pages].length = tmp;
ap->num_pages++;
count += tmp;
pos += tmp;
offset += tmp;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (offset == PAGE_SIZE)
offset = 0;
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
/* If we copied full page, mark it uptodate */
if (tmp == PAGE_SIZE)
SetPageUptodate(page);
if (PageUptodate(page)) {
unlock_page(page);
} else {
ia->write.page_locked = true;
break;
}
if (!fc->big_writes)
break;
} while (iov_iter_count(ii) && count < fc->max_write &&
ap->num_pages < max_pages && offset == 0);
return count > 0 ? count : err;
}
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
unsigned int max_pages)
{
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
return min_t(unsigned int,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
((pos + len - 1) >> PAGE_SHIFT) -
(pos >> PAGE_SHIFT) + 1,
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
max_pages);
}
static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
fuse: hotfix truncate_pagecache() issue The way how fuse calls truncate_pagecache() from fuse_change_attributes() is completely wrong. Because, w/o i_mutex held, we never sure whether 'oldsize' and 'attr->size' are valid by the time of execution of truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we released fc->lock in the middle of fuse_change_attributes(), we completely loose control of actions which may happen with given inode until we reach truncate_pagecache. The list of potentially dangerous actions includes mmap-ed reads and writes, ftruncate(2) and write(2) extending file size. The typical outcome of doing truncate_pagecache() with outdated arguments is data corruption from user point of view. This is (in some sense) acceptable in cases when the issue is triggered by a change of the file on the server (i.e. externally wrt fuse operation), but it is absolutely intolerable in scenarios when a single fuse client modifies a file without any external intervention. A real life case I discovered by fsx-linux looked like this: 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends FUSE_SETATTR to the server synchronously, but before getting fc->lock ... 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP to the server synchronously, then calls fuse_change_attributes(). The latter updates i_size, releases fc->lock, but before comparing oldsize vs attr->size.. 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and updating attributes and i_size, but now oldsize is equal to outarg.attr.size because i_size has just been updated (step 2). Hence, fuse_do_setattr() returns w/o calling truncate_pagecache(). 4. As soon as ftruncate(2) completes, the user extends file size by write(2) making a hole in the middle of file, then reads data from the hole either by read(2) or mmap-ed read. The user expects to get zero data from the hole, but gets stale data because truncate_pagecache() is not executed yet. The scenario above illustrates one side of the problem: not truncating the page cache even though we should. Another side corresponds to truncating page cache too late, when the state of inode changed significantly. Theoretically, the following is possible: 1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size changed (due to our own fuse_do_setattr()) and is going to call truncate_pagecache() for some 'new_size' it believes valid right now. But by the time that particular truncate_pagecache() is called ... 2. fuse_do_setattr() returns (either having called truncate_pagecache() or not -- it doesn't matter). 3. The file is extended either by write(2) or ftruncate(2) or fallocate(2). 4. mmap-ed write makes a page in the extended region dirty. The result will be the lost of data user wrote on the fourth step. The patch is a hotfix resolving the issue in a simplistic way: let's skip dangerous i_size update and truncate_pagecache if an operation changing file size is in progress. This simplistic approach looks correct for the cases w/o external changes. And to handle them properly, more sophisticated and intrusive techniques (e.g. NFS-like one) would be required. I'd like to postpone it until the issue is well discussed on the mailing list(s). Changed in v2: - improved patch description to cover both sides of the issue. Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: stable@vger.kernel.org
2013-08-30 21:06:04 +08:00
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t pos = iocb->ki_pos;
int err = 0;
ssize_t res = 0;
fuse: hotfix truncate_pagecache() issue The way how fuse calls truncate_pagecache() from fuse_change_attributes() is completely wrong. Because, w/o i_mutex held, we never sure whether 'oldsize' and 'attr->size' are valid by the time of execution of truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we released fc->lock in the middle of fuse_change_attributes(), we completely loose control of actions which may happen with given inode until we reach truncate_pagecache. The list of potentially dangerous actions includes mmap-ed reads and writes, ftruncate(2) and write(2) extending file size. The typical outcome of doing truncate_pagecache() with outdated arguments is data corruption from user point of view. This is (in some sense) acceptable in cases when the issue is triggered by a change of the file on the server (i.e. externally wrt fuse operation), but it is absolutely intolerable in scenarios when a single fuse client modifies a file without any external intervention. A real life case I discovered by fsx-linux looked like this: 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends FUSE_SETATTR to the server synchronously, but before getting fc->lock ... 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP to the server synchronously, then calls fuse_change_attributes(). The latter updates i_size, releases fc->lock, but before comparing oldsize vs attr->size.. 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and updating attributes and i_size, but now oldsize is equal to outarg.attr.size because i_size has just been updated (step 2). Hence, fuse_do_setattr() returns w/o calling truncate_pagecache(). 4. As soon as ftruncate(2) completes, the user extends file size by write(2) making a hole in the middle of file, then reads data from the hole either by read(2) or mmap-ed read. The user expects to get zero data from the hole, but gets stale data because truncate_pagecache() is not executed yet. The scenario above illustrates one side of the problem: not truncating the page cache even though we should. Another side corresponds to truncating page cache too late, when the state of inode changed significantly. Theoretically, the following is possible: 1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size changed (due to our own fuse_do_setattr()) and is going to call truncate_pagecache() for some 'new_size' it believes valid right now. But by the time that particular truncate_pagecache() is called ... 2. fuse_do_setattr() returns (either having called truncate_pagecache() or not -- it doesn't matter). 3. The file is extended either by write(2) or ftruncate(2) or fallocate(2). 4. mmap-ed write makes a page in the extended region dirty. The result will be the lost of data user wrote on the fourth step. The patch is a hotfix resolving the issue in a simplistic way: let's skip dangerous i_size update and truncate_pagecache if an operation changing file size is in progress. This simplistic approach looks correct for the cases w/o external changes. And to handle them properly, more sophisticated and intrusive techniques (e.g. NFS-like one) would be required. I'd like to postpone it until the issue is well discussed on the mailing list(s). Changed in v2: - improved patch description to cover both sides of the issue. Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: stable@vger.kernel.org
2013-08-30 21:06:04 +08:00
if (inode->i_size < pos + iov_iter_count(ii))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
do {
ssize_t count;
struct fuse_io_args ia = {};
struct fuse_args_pages *ap = &ia.ap;
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
fc->max_pages);
ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
if (!ap->pages) {
err = -ENOMEM;
break;
}
fuse: fix write deadlock There are two modes for write(2) and friends in fuse: a) write through (update page cache, send sync WRITE request to userspace) b) buffered write (update page cache, async writeout later) The write through method kept all the page cache pages locked that were used for the request. Keeping more than one page locked is deadlock prone and Qian Cai demonstrated this with trinity fuzzing. The reason for keeping the pages locked is that concurrent mapped reads shouldn't try to pull possibly stale data into the page cache. For full page writes, the easy way to fix this is to make the cached page be the authoritative source by marking the page PG_uptodate immediately. After this the page can be safely unlocked, since mapped/cached reads will take the written data from the cache. Concurrent mapped writes will now cause data in the original WRITE request to be updated; this however doesn't cause any data inconsistency and this scenario should be exceedingly rare anyway. If the WRITE request returns with an error in the above case, currently the page is not marked uptodate; this means that a concurrent read will always read consistent data. After this patch the page is uptodate between writing to the cache and receiving the error: there's window where a cached read will read the wrong data. While theoretically this could be a regression, it is unlikely to be one in practice, since this is normal for buffered writes. In case of a partial page write to an already uptodate page the locking is also unnecessary, with the above caveats. Partial write of a not uptodate page still needs to be handled. One way would be to read the complete page before doing the write. This is not possible, since it might break filesystems that don't expect any READ requests when the file was opened O_WRONLY. The other solution is to serialize the synchronous write with reads from the partial pages. The easiest way to do this is to keep the partial pages locked. The problem is that a write() may involve two such pages (one head and one tail). This patch fixes it by only locking the partial tail page. If there's a partial head page as well, then split that off as a separate WRITE request. Reported-by: Qian Cai <cai@lca.pw> Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ Fixes: ea9b9907b82a ("fuse: implement perform_write") Cc: <stable@vger.kernel.org> # v2.6.26 Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-22 04:12:49 +08:00
count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
if (count <= 0) {
err = count;
} else {
err = fuse_send_write_pages(&ia, iocb, inode,
pos, count);
if (!err) {
size_t num_written = ia.write.out.size;
res += num_written;
pos += num_written;
/* break out of the loop on short write */
if (num_written != count)
err = -EIO;
}
}
kfree(ap->pages);
} while (!err && iov_iter_count(ii));
fuse_write_update_attr(inode, pos, res);
fuse: hotfix truncate_pagecache() issue The way how fuse calls truncate_pagecache() from fuse_change_attributes() is completely wrong. Because, w/o i_mutex held, we never sure whether 'oldsize' and 'attr->size' are valid by the time of execution of truncate_pagecache(inode, oldsize, attr->size). In fact, as soon as we released fc->lock in the middle of fuse_change_attributes(), we completely loose control of actions which may happen with given inode until we reach truncate_pagecache. The list of potentially dangerous actions includes mmap-ed reads and writes, ftruncate(2) and write(2) extending file size. The typical outcome of doing truncate_pagecache() with outdated arguments is data corruption from user point of view. This is (in some sense) acceptable in cases when the issue is triggered by a change of the file on the server (i.e. externally wrt fuse operation), but it is absolutely intolerable in scenarios when a single fuse client modifies a file without any external intervention. A real life case I discovered by fsx-linux looked like this: 1. Shrinking ftruncate(2) comes to fuse_do_setattr(). The latter sends FUSE_SETATTR to the server synchronously, but before getting fc->lock ... 2. fuse_dentry_revalidate() is asynchronously called. It sends FUSE_LOOKUP to the server synchronously, then calls fuse_change_attributes(). The latter updates i_size, releases fc->lock, but before comparing oldsize vs attr->size.. 3. fuse_do_setattr() from the first step proceeds by acquiring fc->lock and updating attributes and i_size, but now oldsize is equal to outarg.attr.size because i_size has just been updated (step 2). Hence, fuse_do_setattr() returns w/o calling truncate_pagecache(). 4. As soon as ftruncate(2) completes, the user extends file size by write(2) making a hole in the middle of file, then reads data from the hole either by read(2) or mmap-ed read. The user expects to get zero data from the hole, but gets stale data because truncate_pagecache() is not executed yet. The scenario above illustrates one side of the problem: not truncating the page cache even though we should. Another side corresponds to truncating page cache too late, when the state of inode changed significantly. Theoretically, the following is possible: 1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size changed (due to our own fuse_do_setattr()) and is going to call truncate_pagecache() for some 'new_size' it believes valid right now. But by the time that particular truncate_pagecache() is called ... 2. fuse_do_setattr() returns (either having called truncate_pagecache() or not -- it doesn't matter). 3. The file is extended either by write(2) or ftruncate(2) or fallocate(2). 4. mmap-ed write makes a page in the extended region dirty. The result will be the lost of data user wrote on the fourth step. The patch is a hotfix resolving the issue in a simplistic way: let's skip dangerous i_size update and truncate_pagecache if an operation changing file size is in progress. This simplistic approach looks correct for the cases w/o external changes. And to handle them properly, more sophisticated and intrusive techniques (e.g. NFS-like one) would be required. I'd like to postpone it until the issue is well discussed on the mailing list(s). Changed in v2: - improved patch description to cover both sides of the issue. Signed-off-by: Maxim Patlasov <mpatlasov@parallels.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: stable@vger.kernel.org
2013-08-30 21:06:04 +08:00
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
if (!res)
return err;
iocb->ki_pos += res;
return res;
}
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
struct inode *inode = mapping->host;
ssize_t err;
fuse: don't send ATTR_MODE to kill suid/sgid for handle_killpriv_v2 If client does a write() on a suid/sgid file, VFS will first call fuse_setattr() with ATTR_KILL_S[UG]ID set. This requires sending setattr to file server with ATTR_MODE set to kill suid/sgid. But to do that client needs to know latest mode otherwise it is racy. To reduce the race window, current code first call fuse_do_getattr() to get latest ->i_mode and then resets suid/sgid bits and sends rest to server with setattr(ATTR_MODE). This does not reduce the race completely but narrows race window significantly. With fc->handle_killpriv_v2 enabled, it should be possible to remove this race completely. Do not kill suid/sgid with ATTR_MODE at all. It will be killed by server when WRITE request is sent to server soon. This is similar to fc->handle_killpriv logic. V2 is just more refined version of protocol. Hence this patch does not send ATTR_MODE to kill suid/sgid if fc->handle_killpriv_v2 is enabled. This creates an issue if fc->writeback_cache is enabled. In that case WRITE can be cached in guest and server might not see WRITE request and hence will not kill suid/sgid. Miklos suggested that in such cases, we should fallback to a writethrough WRITE instead and that will generate WRITE request and kill suid/sgid. This patch implements that too. But this relies on client seeing the suid/sgid set. If another client sets suid/sgid and this client does not see it immideately, then we will not fallback to writethrough WRITE. So this is one limitation with both fc->handle_killpriv_v2 and fc->writeback_cache enabled. Both the options are not fully compatible. But might be good enough for many use cases. Note: This patch is not checking whether security.capability is set or not when falling back to writethrough path. If suid/sgid is not set and only security.capability is set, that will be taken care of by file_remove_privs() call in ->writeback_cache path. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-10 02:15:10 +08:00
struct fuse_conn *fc = get_fuse_conn(inode);
fuse: don't send ATTR_MODE to kill suid/sgid for handle_killpriv_v2 If client does a write() on a suid/sgid file, VFS will first call fuse_setattr() with ATTR_KILL_S[UG]ID set. This requires sending setattr to file server with ATTR_MODE set to kill suid/sgid. But to do that client needs to know latest mode otherwise it is racy. To reduce the race window, current code first call fuse_do_getattr() to get latest ->i_mode and then resets suid/sgid bits and sends rest to server with setattr(ATTR_MODE). This does not reduce the race completely but narrows race window significantly. With fc->handle_killpriv_v2 enabled, it should be possible to remove this race completely. Do not kill suid/sgid with ATTR_MODE at all. It will be killed by server when WRITE request is sent to server soon. This is similar to fc->handle_killpriv logic. V2 is just more refined version of protocol. Hence this patch does not send ATTR_MODE to kill suid/sgid if fc->handle_killpriv_v2 is enabled. This creates an issue if fc->writeback_cache is enabled. In that case WRITE can be cached in guest and server might not see WRITE request and hence will not kill suid/sgid. Miklos suggested that in such cases, we should fallback to a writethrough WRITE instead and that will generate WRITE request and kill suid/sgid. This patch implements that too. But this relies on client seeing the suid/sgid set. If another client sets suid/sgid and this client does not see it immideately, then we will not fallback to writethrough WRITE. So this is one limitation with both fc->handle_killpriv_v2 and fc->writeback_cache enabled. Both the options are not fully compatible. But might be good enough for many use cases. Note: This patch is not checking whether security.capability is set or not when falling back to writethrough path. If suid/sgid is not set and only security.capability is set, that will be taken care of by file_remove_privs() call in ->writeback_cache path. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-10 02:15:10 +08:00
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
err = fuse_update_attributes(mapping->host, file,
STATX_SIZE | STATX_MODE);
if (err)
return err;
fuse: don't send ATTR_MODE to kill suid/sgid for handle_killpriv_v2 If client does a write() on a suid/sgid file, VFS will first call fuse_setattr() with ATTR_KILL_S[UG]ID set. This requires sending setattr to file server with ATTR_MODE set to kill suid/sgid. But to do that client needs to know latest mode otherwise it is racy. To reduce the race window, current code first call fuse_do_getattr() to get latest ->i_mode and then resets suid/sgid bits and sends rest to server with setattr(ATTR_MODE). This does not reduce the race completely but narrows race window significantly. With fc->handle_killpriv_v2 enabled, it should be possible to remove this race completely. Do not kill suid/sgid with ATTR_MODE at all. It will be killed by server when WRITE request is sent to server soon. This is similar to fc->handle_killpriv logic. V2 is just more refined version of protocol. Hence this patch does not send ATTR_MODE to kill suid/sgid if fc->handle_killpriv_v2 is enabled. This creates an issue if fc->writeback_cache is enabled. In that case WRITE can be cached in guest and server might not see WRITE request and hence will not kill suid/sgid. Miklos suggested that in such cases, we should fallback to a writethrough WRITE instead and that will generate WRITE request and kill suid/sgid. This patch implements that too. But this relies on client seeing the suid/sgid set. If another client sets suid/sgid and this client does not see it immideately, then we will not fallback to writethrough WRITE. So this is one limitation with both fc->handle_killpriv_v2 and fc->writeback_cache enabled. Both the options are not fully compatible. But might be good enough for many use cases. Note: This patch is not checking whether security.capability is set or not when falling back to writethrough path. If suid/sgid is not set and only security.capability is set, that will be taken care of by file_remove_privs() call in ->writeback_cache path. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-10 02:15:10 +08:00
if (fc->handle_killpriv_v2 &&
setattr_should_drop_suidgid(&nop_mnt_idmap,
file_inode(file))) {
fuse: don't send ATTR_MODE to kill suid/sgid for handle_killpriv_v2 If client does a write() on a suid/sgid file, VFS will first call fuse_setattr() with ATTR_KILL_S[UG]ID set. This requires sending setattr to file server with ATTR_MODE set to kill suid/sgid. But to do that client needs to know latest mode otherwise it is racy. To reduce the race window, current code first call fuse_do_getattr() to get latest ->i_mode and then resets suid/sgid bits and sends rest to server with setattr(ATTR_MODE). This does not reduce the race completely but narrows race window significantly. With fc->handle_killpriv_v2 enabled, it should be possible to remove this race completely. Do not kill suid/sgid with ATTR_MODE at all. It will be killed by server when WRITE request is sent to server soon. This is similar to fc->handle_killpriv logic. V2 is just more refined version of protocol. Hence this patch does not send ATTR_MODE to kill suid/sgid if fc->handle_killpriv_v2 is enabled. This creates an issue if fc->writeback_cache is enabled. In that case WRITE can be cached in guest and server might not see WRITE request and hence will not kill suid/sgid. Miklos suggested that in such cases, we should fallback to a writethrough WRITE instead and that will generate WRITE request and kill suid/sgid. This patch implements that too. But this relies on client seeing the suid/sgid set. If another client sets suid/sgid and this client does not see it immideately, then we will not fallback to writethrough WRITE. So this is one limitation with both fc->handle_killpriv_v2 and fc->writeback_cache enabled. Both the options are not fully compatible. But might be good enough for many use cases. Note: This patch is not checking whether security.capability is set or not when falling back to writethrough path. If suid/sgid is not set and only security.capability is set, that will be taken care of by file_remove_privs() call in ->writeback_cache path. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-10 02:15:10 +08:00
goto writethrough;
}
return generic_file_write_iter(iocb, from);
}
fuse: don't send ATTR_MODE to kill suid/sgid for handle_killpriv_v2 If client does a write() on a suid/sgid file, VFS will first call fuse_setattr() with ATTR_KILL_S[UG]ID set. This requires sending setattr to file server with ATTR_MODE set to kill suid/sgid. But to do that client needs to know latest mode otherwise it is racy. To reduce the race window, current code first call fuse_do_getattr() to get latest ->i_mode and then resets suid/sgid bits and sends rest to server with setattr(ATTR_MODE). This does not reduce the race completely but narrows race window significantly. With fc->handle_killpriv_v2 enabled, it should be possible to remove this race completely. Do not kill suid/sgid with ATTR_MODE at all. It will be killed by server when WRITE request is sent to server soon. This is similar to fc->handle_killpriv logic. V2 is just more refined version of protocol. Hence this patch does not send ATTR_MODE to kill suid/sgid if fc->handle_killpriv_v2 is enabled. This creates an issue if fc->writeback_cache is enabled. In that case WRITE can be cached in guest and server might not see WRITE request and hence will not kill suid/sgid. Miklos suggested that in such cases, we should fallback to a writethrough WRITE instead and that will generate WRITE request and kill suid/sgid. This patch implements that too. But this relies on client seeing the suid/sgid set. If another client sets suid/sgid and this client does not see it immideately, then we will not fallback to writethrough WRITE. So this is one limitation with both fc->handle_killpriv_v2 and fc->writeback_cache enabled. Both the options are not fully compatible. But might be good enough for many use cases. Note: This patch is not checking whether security.capability is set or not when falling back to writethrough path. If suid/sgid is not set and only security.capability is set, that will be taken care of by file_remove_privs() call in ->writeback_cache path. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-10-10 02:15:10 +08:00
writethrough:
inode_lock(inode);
err = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
err = file_remove_privs(file);
if (err)
goto out;
err = file_update_time(file);
if (err)
goto out;
if (iocb->ki_flags & IOCB_DIRECT) {
written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
} else {
written = fuse_perform_write(iocb, from);
}
out:
inode_unlock(inode);
if (written > 0)
written = generic_write_sync(iocb, written);
return written ? written : err;
}
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
}
static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
size_t max_size)
{
return min(iov_iter_single_seg_count(ii), max_size);
}
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
size_t *nbytesp, int write,
unsigned int max_pages)
{
size_t nbytes = 0; /* # bytes already packed in req */
ssize_t ret = 0;
/* Special case for kernel I/O: can copy directly into the buffer */
if (iov_iter_is_kvec(ii)) {
unsigned long user_addr = fuse_get_user_addr(ii);
size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
if (write)
ap->args.in_args[1].value = (void *) user_addr;
else
ap->args.out_args[0].value = (void *) user_addr;
iov_iter_advance(ii, frag_size);
*nbytesp = frag_size;
return 0;
}
while (nbytes < *nbytesp && ap->num_pages < max_pages) {
unsigned npages;
size_t start;
ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
*nbytesp - nbytes,
max_pages - ap->num_pages,
&start);
if (ret < 0)
break;
nbytes += ret;
ret += start;
npages = DIV_ROUND_UP(ret, PAGE_SIZE);
ap->descs[ap->num_pages].offset = start;
fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
ap->num_pages += npages;
ap->descs[ap->num_pages - 1].length -=
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
else
ap->args.out_pages = true;
*nbytesp = nbytes;
return ret < 0 ? ret : 0;
}
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
loff_t *ppos, int flags)
{
int write = flags & FUSE_DIO_WRITE;
int cuse = flags & FUSE_DIO_CUSE;
struct file *file = io->iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
size_t count = iov_iter_count(iter);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
pgoff_t idx_from = pos >> PAGE_SHIFT;
pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
ssize_t res = 0;
int err = 0;
struct fuse_io_args *ia;
unsigned int max_pages;
bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;
max_pages = iov_iter_npages(iter, fc->max_pages);
ia = fuse_io_alloc(io, max_pages);
if (!ia)
return -ENOMEM;
if (fopen_direct_io && fc->direct_io_allow_mmap) {
res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
if (res) {
fuse_io_free(ia);
return res;
}
}
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
fuse_sync_writes(inode);
if (!write)
inode_unlock(inode);
}
if (fopen_direct_io && write) {
res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
if (res) {
fuse_io_free(ia);
return res;
}
}
io->should_dirty = !write && user_backed_iter(iter);
while (count) {
ssize_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
max_pages);
if (err && !nbytes)
break;
if (write) {
if (!capable(CAP_FSETID))
ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID;
nres = fuse_send_write(ia, pos, nbytes, owner);
} else {
nres = fuse_send_read(ia, pos, nbytes, owner);
}
if (!io->async || nres < 0) {
fuse_release_user_pages(&ia->ap, io->should_dirty);
fuse_io_free(ia);
}
ia = NULL;
if (nres < 0) {
iov_iter_revert(iter, nbytes);
err = nres;
break;
}
WARN_ON(nres > nbytes);
count -= nres;
res += nres;
pos += nres;
if (nres != nbytes) {
iov_iter_revert(iter, nbytes - nres);
break;
}
if (count) {
max_pages = iov_iter_npages(iter, fc->max_pages);
ia = fuse_io_alloc(io, max_pages);
if (!ia)
break;
}
}
if (ia)
fuse_io_free(ia);
if (res > 0)
*ppos = pos;
return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
struct iov_iter *iter,
loff_t *ppos)
{
ssize_t res;
struct inode *inode = file_inode(io->iocb->ki_filp);
res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_atime(inode);
return res;
}
static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
ssize_t res;
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
res = fuse_direct_IO(iocb, to);
} else {
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
res = __fuse_direct_read(&io, to, &iocb->ki_pos);
}
return res;
}
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
struct iov_iter *iter)
{
struct inode *inode = file_inode(iocb->ki_filp);
return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
}
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
bool exclusive_lock =
!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
get_fuse_conn(inode)->direct_io_allow_mmap ||
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
iocb->ki_flags & IOCB_APPEND ||
fuse_direct_write_extending_i_size(iocb, from);
/*
* Take exclusive lock if
* - Parallel direct writes are disabled - a user space decision
* - Parallel direct writes are enabled and i_size is being extended.
* - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
* This might not be needed at all, but needs further investigation.
*/
if (exclusive_lock)
inode_lock(inode);
else {
inode_lock_shared(inode);
/* A race with truncate might have come up as the decision for
* the lock type was done without holding the lock, check again.
*/
if (fuse_direct_write_extending_i_size(iocb, from)) {
inode_unlock_shared(inode);
inode_lock(inode);
exclusive_lock = true;
}
}
res = generic_write_checks(iocb, from);
if (res > 0) {
if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
res = fuse_direct_IO(iocb, from);
} else {
res = fuse_direct_io(&io, from, &iocb->ki_pos,
FUSE_DIO_WRITE);
fuse_write_update_attr(inode, iocb->ki_pos, res);
}
}
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
if (exclusive_lock)
inode_unlock(inode);
else
inode_unlock_shared(inode);
return res;
}
static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
if (fuse_is_bad(inode))
return -EIO;
if (FUSE_IS_DAX(inode))
return fuse_dax_read_iter(iocb, to);
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_read_iter(iocb, to);
else
return fuse_direct_read_iter(iocb, to);
}
static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
if (fuse_is_bad(inode))
return -EIO;
if (FUSE_IS_DAX(inode))
return fuse_dax_write_iter(iocb, from);
if (!(ff->open_flags & FOPEN_DIRECT_IO))
return fuse_cache_write_iter(iocb, from);
else
return fuse_direct_write_iter(iocb, from);
}
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
int i;
if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);
for (i = 0; i < ap->num_pages; i++)
__free_page(ap->pages[i]);
if (wpa->ia.ff)
fuse_file_put(wpa->ia.ff, false, false);
kfree(ap->pages);
kfree(wpa);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
static void fuse_writepage_finish(struct fuse_mount *fm,
struct fuse_writepage_args *wpa)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
struct fuse_inode *fi = get_fuse_inode(inode);
struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
for (i = 0; i < ap->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
wake_up(&fi->page_waitq);
}
/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_mount *fm,
struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct fuse_writepage_args *aux, *next;
struct fuse_inode *fi = get_fuse_inode(wpa->inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_args *args = &wpa->ia.ap.args;
__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
int err;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
fi->writectr++;
if (inarg->offset + data_size <= size) {
inarg->size = data_size;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
} else if (inarg->offset < size) {
inarg->size = size - inarg->offset;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
} else {
/* Got truncated off completely */
goto out_free;
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
args->in_args[1].size = inarg->size;
args->force = true;
args->nocreds = true;
err = fuse_simple_background(fm, args, GFP_ATOMIC);
if (err == -ENOMEM) {
spin_unlock(&fi->lock);
err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&fi->lock);
}
/* Fails on broken connection only */
if (unlikely(err))
goto out_free;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
return;
out_free:
fi->writectr--;
rb_erase(&wpa->writepages_entry, &fi->writepages);
fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
/* After fuse_writepage_finish() aux request list is private */
for (aux = wpa->next; aux; aux = next) {
next = aux->next;
aux->next = NULL;
fuse_writepage_free(aux);
}
fuse_writepage_free(wpa);
spin_lock(&fi->lock);
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
/*
* If fi->writectr is positive (no truncate or fsync going on) send
* all queued writepage requests.
*
* Called with fi->lock
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
*/
void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
struct fuse_mount *fm = get_fuse_mount(inode);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
struct fuse_inode *fi = get_fuse_inode(inode);
loff_t crop = i_size_read(inode);
struct fuse_writepage_args *wpa;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
wpa = list_entry(fi->queued_writes.next,
struct fuse_writepage_args, queue_entry);
list_del_init(&wpa->queue_entry);
fuse_send_writepage(fm, wpa, crop);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
}
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
struct fuse_writepage_args *wpa)
{
pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
WARN_ON(!wpa->ia.ap.num_pages);
while (*p) {
struct fuse_writepage_args *curr;
pgoff_t curr_index;
parent = *p;
curr = rb_entry(parent, struct fuse_writepage_args,
writepages_entry);
WARN_ON(curr->inode != wpa->inode);
curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
if (idx_from >= curr_index + curr->ia.ap.num_pages)
p = &(*p)->rb_right;
else if (idx_to < curr_index)
p = &(*p)->rb_left;
else
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
return curr;
}
rb_link_node(&wpa->writepages_entry, parent, p);
rb_insert_color(&wpa->writepages_entry, root);
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
return NULL;
}
static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
{
WARN_ON(fuse_insert_writeback(root, wpa));
}
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct fuse_writepage_args *wpa =
container_of(args, typeof(*wpa), ia.ap.args);
struct inode *inode = wpa->inode;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
struct fuse_inode *fi = get_fuse_inode(inode);
fuse: invalidate attrs when page writeback completes In fuse when a direct/write-through write happens we invalidate attrs because that might have updated mtime/ctime on server and cached mtime/ctime will be stale. What about page writeback path. Looks like we don't invalidate attrs there. To be consistent, invalidate attrs in writeback path as well. Only exception is when writeback_cache is enabled. In that case we strust local mtime/ctime and there is no need to invalidate attrs. Recently users started experiencing failure of xfstests generic/080, geneirc/215 and generic/614 on virtiofs. This happened only newer "stat" utility and not older one. This patch fixes the issue. So what's the root cause of the issue. Here is detailed explanation. generic/080 test does mmap write to a file, closes the file and then checks if mtime has been updated or not. When file is closed, it leads to flushing of dirty pages (and that should update mtime/ctime on server). But we did not explicitly invalidate attrs after writeback finished. Still generic/080 passed so far and reason being that we invalidated atime in fuse_readpages_end(). This is called in fuse_readahead() path and always seems to trigger before mmaped write. So after mmaped write when lstat() is called, it sees that atleast one of the fields being asked for is invalid (atime) and that results in generating GETATTR to server and mtime/ctime also get updated and test passes. But newer /usr/bin/stat seems to have moved to using statx() syscall now (instead of using lstat()). And statx() allows it to query only ctime or mtime (and not rest of the basic stat fields). That means when querying for mtime, fuse_update_get_attr() sees that mtime is not invalid (only atime is invalid). So it does not generate a new GETATTR and fill stat with cached mtime/ctime. And that means updated mtime is not seen by xfstest and tests start failing. Invalidating attrs after writeback completion should solve this problem in a generic manner. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-04-06 22:07:06 +08:00
struct fuse_conn *fc = get_fuse_conn(inode);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
mapping_set_error(inode->i_mapping, error);
fuse: invalidate attrs when page writeback completes In fuse when a direct/write-through write happens we invalidate attrs because that might have updated mtime/ctime on server and cached mtime/ctime will be stale. What about page writeback path. Looks like we don't invalidate attrs there. To be consistent, invalidate attrs in writeback path as well. Only exception is when writeback_cache is enabled. In that case we strust local mtime/ctime and there is no need to invalidate attrs. Recently users started experiencing failure of xfstests generic/080, geneirc/215 and generic/614 on virtiofs. This happened only newer "stat" utility and not older one. This patch fixes the issue. So what's the root cause of the issue. Here is detailed explanation. generic/080 test does mmap write to a file, closes the file and then checks if mtime has been updated or not. When file is closed, it leads to flushing of dirty pages (and that should update mtime/ctime on server). But we did not explicitly invalidate attrs after writeback finished. Still generic/080 passed so far and reason being that we invalidated atime in fuse_readpages_end(). This is called in fuse_readahead() path and always seems to trigger before mmaped write. So after mmaped write when lstat() is called, it sees that atleast one of the fields being asked for is invalid (atime) and that results in generating GETATTR to server and mtime/ctime also get updated and test passes. But newer /usr/bin/stat seems to have moved to using statx() syscall now (instead of using lstat()). And statx() allows it to query only ctime or mtime (and not rest of the basic stat fields). That means when querying for mtime, fuse_update_get_attr() sees that mtime is not invalid (only atime is invalid). So it does not generate a new GETATTR and fill stat with cached mtime/ctime. And that means updated mtime is not seen by xfstest and tests start failing. Invalidating attrs after writeback completion should solve this problem in a generic manner. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2021-04-06 22:07:06 +08:00
/*
* A writeback finished and this might have updated mtime/ctime on
* server making local mtime/ctime stale. Hence invalidate attrs.
* Do this only if writeback_cache is not enabled. If writeback_cache
* is enabled, we trust local ctime/mtime.
*/
if (!fc->writeback_cache)
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
rb_erase(&wpa->writepages_entry, &fi->writepages);
while (wpa->next) {
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_write_in *inarg = &wpa->ia.write.in;
struct fuse_writepage_args *next = wpa->next;
wpa->next = next->next;
next->next = NULL;
next->ia.ff = fuse_file_get(wpa->ia.ff);
tree_insert(&fi->writepages, next);
/*
* Skip fuse_flush_writepages() to make it easy to crop requests
* based on primary request size.
*
* 1st case (trivial): there are no concurrent activities using
* fuse_set/release_nowrite. Then we're on safe side because
* fuse_flush_writepages() would call fuse_send_writepage()
* anyway.
*
* 2nd case: someone called fuse_set_nowrite and it is waiting
* now for completion of all in-flight requests. This happens
* rarely and no more than once per page, so this should be
* okay.
*
* 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
* of fuse_set_nowrite..fuse_release_nowrite section. The fact
* that fuse_set_nowrite returned implies that all in-flight
* requests were completed along with all of their secondary
* requests. Further primary requests are blocked by negative
* writectr. Hence there cannot be any in-flight requests and
* no invocations of fuse_writepage_end() while we're in
* fuse_set_nowrite..fuse_release_nowrite section.
*/
fuse_send_writepage(fm, next, inarg->offset + inarg->size);
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
fi->writectr--;
fuse_writepage_finish(fm, wpa);
spin_unlock(&fi->lock);
fuse_writepage_free(wpa);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi)
{
struct fuse_file *ff;
spin_lock(&fi->lock);
ff = list_first_entry_or_null(&fi->write_files, struct fuse_file,
write_entry);
if (ff)
fuse_file_get(ff);
spin_unlock(&fi->lock);
return ff;
}
static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi)
{
struct fuse_file *ff = __fuse_write_file_get(fi);
WARN_ON(!ff);
return ff;
}
int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_file *ff;
int err;
/*
* Inode is always written before the last reference is dropped and
* hence this should not be reached from reclaim.
*
* Writing back the inode from reclaim can deadlock if the request
* processing itself needs an allocation. Allocations triggering
* reclaim while serving a request can't be prevented, because it can
* involve any number of unrelated userspace processes.
*/
WARN_ON(wbc->for_reclaim);
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
fuse_file_put(ff, false, false);
return err;
}
static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
{
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
if (wpa) {
ap = &wpa->ia.ap;
ap->num_pages = 0;
ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
if (!ap->pages) {
kfree(wpa);
wpa = NULL;
}
}
return wpa;
}
static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
struct fuse_writepage_args *wpa)
{
if (!fc->sync_fs)
return;
rcu_read_lock();
/* Prevent resurrection of dead bucket in unlikely race with syncfs */
do {
wpa->bucket = rcu_dereference(fc->curr_bucket);
} while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count)));
rcu_read_unlock();
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_writepage_args *wpa;
struct fuse_args_pages *ap;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
struct page *tmp_page;
int error = -ENOMEM;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
set_page_writeback(page);
wpa = fuse_writepage_args_alloc();
if (!wpa)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
goto err;
ap = &wpa->ia.ap;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
goto err_free;
error = -EIO;
wpa->ia.ff = fuse_write_file_get(fi);
if (!wpa->ia.ff)
goto err_nofile;
fuse_writepage_add_to_bucket(fc, wpa);
fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
copy_highpage(tmp_page, page);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->next = NULL;
ap->args.in_pages = true;
ap->num_pages = 1;
ap->pages[0] = tmp_page;
ap->descs[0].offset = 0;
ap->descs[0].length = PAGE_SIZE;
ap->args.end = fuse_writepage_end;
wpa->inode = inode;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
spin_lock(&fi->lock);
tree_insert(&fi->writepages, wpa);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
end_page_writeback(page);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
return 0;
err_nofile:
__free_page(tmp_page);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
err_free:
kfree(wpa);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
err:
mapping_set_error(page->mapping, error);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
end_page_writeback(page);
return error;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown <neilb@suse.de> Cc: Anna Schumaker <Anna.Schumaker@Netapp.com> Cc: Chao Yu <chao@kernel.org> Cc: Darrick J. Wong <djwong@kernel.org> Cc: Ilya Dryomov <idryomov@gmail.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Lars Ellenberg <lars.ellenberg@linbit.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Paolo Valente <paolo.valente@linaro.org> Cc: Philipp Reisner <philipp.reisner@linbit.com> Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-23 05:38:58 +08:00
struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
int err;
if (fuse_page_is_writeback(page->mapping->host, page->index)) {
/*
* ->writepages() should be called for sync() and friends. We
* should only get here on direct reclaim and then we are
* allowed to skip a page which is already in flight
*/
WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown <neilb@suse.de> Cc: Anna Schumaker <Anna.Schumaker@Netapp.com> Cc: Chao Yu <chao@kernel.org> Cc: Darrick J. Wong <djwong@kernel.org> Cc: Ilya Dryomov <idryomov@gmail.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Lars Ellenberg <lars.ellenberg@linbit.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Paolo Valente <paolo.valente@linaro.org> Cc: Philipp Reisner <philipp.reisner@linbit.com> Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-23 05:38:58 +08:00
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return AOP_WRITEPAGE_ACTIVATE;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
err = fuse_writepage_locked(page);
unlock_page(page);
return err;
}
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
struct inode *inode;
struct page **orig_pages;
unsigned int max_pages;
};
static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
{
struct fuse_args_pages *ap = &data->wpa->ia.ap;
struct fuse_conn *fc = get_fuse_conn(data->inode);
struct page **pages;
struct fuse_page_desc *descs;
unsigned int npages = min_t(unsigned int,
max_t(unsigned int, data->max_pages * 2,
FUSE_DEFAULT_MAX_PAGES_PER_REQ),
fc->max_pages);
WARN_ON(npages <= data->max_pages);
pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
if (!pages)
return false;
memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
kfree(ap->pages);
ap->pages = pages;
ap->descs = descs;
data->max_pages = npages;
return true;
}
static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
struct fuse_writepage_args *wpa = data->wpa;
struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
int num_pages = wpa->ia.ap.num_pages;
int i;
wpa->ia.ff = fuse_file_get(data->ff);
spin_lock(&fi->lock);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
for (i = 0; i < num_pages; i++)
end_page_writeback(data->orig_pages[i]);
}
/*
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
* Check under fi->lock if the page is under writeback, and insert it onto the
* rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
* one already added for a page at this offset. If there's none, then insert
* this new request onto the auxiliary list, otherwise reuse the existing one by
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
* swapping the new temp page with the old one.
*/
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
struct page *page)
{
struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
struct fuse_writepage_args *tmp;
struct fuse_writepage_args *old_wpa;
struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
WARN_ON(new_ap->num_pages != 0);
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
new_ap->num_pages = 1;
spin_lock(&fi->lock);
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
if (!old_wpa) {
spin_unlock(&fi->lock);
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
return true;
}
for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
pgoff_t curr_index;
WARN_ON(tmp->inode != new_wpa->inode);
curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
if (curr_index == page->index) {
WARN_ON(tmp->ia.ap.num_pages != 1);
swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
break;
}
}
if (!tmp) {
new_wpa->next = old_wpa->next;
old_wpa->next = new_wpa;
}
spin_unlock(&fi->lock);
if (tmp) {
struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(new_wpa);
}
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
return false;
}
static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page,
struct fuse_args_pages *ap,
struct fuse_fill_wb_data *data)
{
WARN_ON(!ap->num_pages);
/*
* Being under writeback is unlikely but possible. For example direct
* read to an mmaped fuse file will set the page dirty twice; once when
* the pages are faulted with get_user_pages(), and then after the read
* completed.
*/
if (fuse_page_is_writeback(data->inode, page->index))
return true;
/* Reached max pages */
if (ap->num_pages == fc->max_pages)
return true;
/* Reached max write bytes */
if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write)
return true;
/* Discontinuity */
if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)
return true;
/* Need to grow the pages array? If so, did the expansion fail? */
if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data))
return true;
return false;
}
static int fuse_writepages_fill(struct folio *folio,
struct writeback_control *wbc, void *_data)
{
struct fuse_fill_wb_data *data = _data;
struct fuse_writepage_args *wpa = data->wpa;
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct page *tmp_page;
int err;
if (!data->ff) {
err = -EIO;
data->ff = fuse_write_file_get(fi);
if (!data->ff)
goto out_unlock;
}
if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
}
err = -ENOMEM;
tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (!tmp_page)
goto out_unlock;
/*
* The page must not be redirtied until the writeout is completed
* (i.e. userspace has sent a reply to the write request). Otherwise
* there could be more than one temporary page instance for each real
* page.
*
* This is ensured by holding the page lock in page_mkwrite() while
* checking fuse_page_is_writeback(). We already hold the page lock
* since clear_page_dirty_for_io() and keep it held until we add the
* request to the fi->writepages list and increment ap->num_pages.
* After this fuse_page_is_writeback() will indicate that the page is
* under writeback, so we can release the page lock.
*/
if (data->wpa == NULL) {
err = -ENOMEM;
wpa = fuse_writepage_args_alloc();
if (!wpa) {
__free_page(tmp_page);
goto out_unlock;
}
fuse_writepage_add_to_bucket(fc, wpa);
data->max_pages = 1;
ap = &wpa->ia.ap;
fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->next = NULL;
ap->args.in_pages = true;
ap->args.end = fuse_writepage_end;
ap->num_pages = 0;
wpa->inode = inode;
}
folio_start_writeback(folio);
copy_highpage(tmp_page, &folio->page);
ap->pages[ap->num_pages] = tmp_page;
ap->descs[ap->num_pages].offset = 0;
ap->descs[ap->num_pages].length = PAGE_SIZE;
data->orig_pages[ap->num_pages] = &folio->page;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
if (data->wpa) {
/*
* Protected by fi->lock against concurrent access by
* fuse_page_is_writeback().
*/
spin_lock(&fi->lock);
ap->num_pages++;
spin_unlock(&fi->lock);
} else if (fuse_writepage_add(wpa, &folio->page)) {
fuse: fix warning in tree_insert() and clean up writepage insertion fuse_writepages_fill() calls tree_insert() with ap->num_pages = 0 which triggers the following warning: WARNING: CPU: 1 PID: 17211 at fs/fuse/file.c:1728 tree_insert+0xab/0xc0 [fuse] RIP: 0010:tree_insert+0xab/0xc0 [fuse] Call Trace: fuse_writepages_fill+0x5da/0x6a0 [fuse] write_cache_pages+0x171/0x470 fuse_writepages+0x8a/0x100 [fuse] do_writepages+0x43/0xe0 Fix up the warning and clean up the code around rb-tree insertion: - Rename tree_insert() to fuse_insert_writeback() and make it return the conflicting entry in case of failure - Re-add tree_insert() as a wrapper around fuse_insert_writeback() - Rename fuse_writepage_in_flight() to fuse_writepage_add() and reverse the meaning of the return value to mean + "true" in case the writepage entry was successfully added + "false" in case it was in-fligt queued on an existing writepage entry's auxiliary list or the existing writepage entry's temporary page updated Switch from fuse_find_writeback() + tree_insert() to fuse_insert_writeback() - Move setting orig_pages to before inserting/updating the entry; this may result in the orig_pages value being discarded later in case of an in-flight request - In case of a new writepage entry use fuse_writepage_add() unconditionally, only set data->wpa if the entry was added. Fixes: 6b2fb79963fb ("fuse: optimize writepages search") Reported-by: kernel test robot <rong.a.chen@intel.com> Original-path-by: Vasily Averin <vvs@virtuozzo.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-07-14 20:45:41 +08:00
data->wpa = wpa;
} else {
folio_end_writeback(folio);
}
out_unlock:
folio_unlock(folio);
return err;
}
static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
err = -EIO;
if (fuse_is_bad(inode))
goto out;
fuse: remove reliance on bdi congestion The bdi congestion tracking in not widely used and will be removed. Fuse is one of a small number of filesystems that uses it, setting both the sync (read) and async (write) congestion flags at what it determines are appropriate times. The only remaining effect of the sync flag is to cause read-ahead to be skipped. The only remaining effect of the async flag is to cause (some) WB_SYNC_NONE writes to be skipped. So instead of setting the flags, change: - .readahead to stop when it has submitted all non-async pages for read. - .writepages to do nothing if WB_SYNC_NONE and the flag would be set - .writepage to return AOP_WRITEPAGE_ACTIVATE if WB_SYNC_NONE and the flag would be set. The writepages change causes a behavioural change in that pageout() can now return PAGE_ACTIVATE instead of PAGE_KEEP, so SetPageActive() will be called on the page which (I think) will further delay the next attempt at writeout. This might be a good thing. Link: https://lkml.kernel.org/r/164549983737.9187.2627117501000365074.stgit@noble.brown Signed-off-by: NeilBrown <neilb@suse.de> Cc: Anna Schumaker <Anna.Schumaker@Netapp.com> Cc: Chao Yu <chao@kernel.org> Cc: Darrick J. Wong <djwong@kernel.org> Cc: Ilya Dryomov <idryomov@gmail.com> Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Jan Kara <jack@suse.cz> Cc: Jeff Layton <jlayton@kernel.org> Cc: Jens Axboe <axboe@kernel.dk> Cc: Lars Ellenberg <lars.ellenberg@linbit.com> Cc: Miklos Szeredi <miklos@szeredi.hu> Cc: Paolo Valente <paolo.valente@linaro.org> Cc: Philipp Reisner <philipp.reisner@linbit.com> Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> Cc: Trond Myklebust <trond.myklebust@hammerspace.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-03-23 05:38:58 +08:00
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return 0;
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;
err = -ENOMEM;
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
data.orig_pages = kcalloc(fc->max_pages,
sizeof(struct page *),
GFP_NOFS);
if (!data.orig_pages)
goto out;
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.wpa) {
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
}
if (data.ff)
fuse_file_put(data.ff, false, false);
kfree(data.orig_pages);
out:
return err;
}
/*
* It's worthy to make sure that space is reserved on disk for the write,
* but how to implement it without killing performance need more thinking.
*/
static int fuse_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, struct page **pagep, void **fsdata)
{
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
pgoff_t index = pos >> PAGE_SHIFT;
struct fuse_conn *fc = get_fuse_conn(file_inode(file));
struct page *page;
loff_t fsize;
int err = -ENOMEM;
WARN_ON(!fc->writeback_cache);
page = grab_cache_page_write_begin(mapping, index);
if (!page)
goto error;
fuse_wait_on_page_writeback(mapping->host, page->index);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (PageUptodate(page) || len == PAGE_SIZE)
goto success;
/*
* Check if the start this page comes after the end of file, in which
* case the readpage can be optimized away.
*/
fsize = i_size_read(mapping->host);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (fsize <= (pos & PAGE_MASK)) {
size_t off = pos & ~PAGE_MASK;
if (off)
zero_user_segment(page, 0, off);
goto success;
}
err = fuse_do_readpage(file, page);
if (err)
goto cleanup;
success:
*pagep = page;
return 0;
cleanup:
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
error:
return err;
}
static int fuse_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
/* Haven't copied anything? Skip zeroing, size extending, dirtying. */
if (!copied)
goto unlock;
pos += copied;
if (!PageUptodate(page)) {
/* Zero any unwritten bytes at the end of the page */
size_t endoff = pos & ~PAGE_MASK;
if (endoff)
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
zero_user_segment(page, endoff, PAGE_SIZE);
SetPageUptodate(page);
}
if (pos > inode->i_size)
i_size_write(inode, pos);
set_page_dirty(page);
unlock:
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
return copied;
}
static int fuse_launder_folio(struct folio *folio)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
int err = 0;
if (folio_clear_dirty_for_io(folio)) {
struct inode *inode = folio->mapping->host;
/* Serialize with pending writeback for the same page */
fuse_wait_on_page_writeback(inode, folio->index);
err = fuse_writepage_locked(&folio->page);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
if (!err)
fuse_wait_on_page_writeback(inode, folio->index);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
return err;
}
/*
* Write back dirty data/metadata now (there may not be any suitable
* open files later for data)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
*/
static void fuse_vma_close(struct vm_area_struct *vma)
{
int err;
err = write_inode_now(vma->vm_file->f_mapping->host, 1);
mapping_set_error(vma->vm_file->f_mapping, err);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
/*
* Wait for writeback against this page to complete before allowing it
* to be marked dirty again, and hence written back again, possibly
* before the previous writepage completed.
*
* Block here, instead of in ->writepage(), so that the userspace fs
* can only block processes actually operating on the filesystem.
*
* Otherwise unprivileged userspace fs would be able to block
* unrelated:
*
* - page migration
* - sync(2)
* - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
*/
static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
file_update_time(vmf->vma->vm_file);
lock_page(page);
if (page->mapping != inode->i_mapping) {
unlock_page(page);
return VM_FAULT_NOPAGE;
}
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
fuse_wait_on_page_writeback(inode, page->index);
return VM_FAULT_LOCKED;
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
}
static const struct vm_operations_struct fuse_file_vm_ops = {
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
.close = fuse_vma_close,
.fault = filemap_fault,
.map_pages = filemap_map_pages,
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
.page_mkwrite = fuse_page_mkwrite,
};
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
/* DAX mmap is superior to direct_io mmap */
if (FUSE_IS_DAX(file_inode(file)))
return fuse_dax_mmap(file, vma);
if (ff->open_flags & FOPEN_DIRECT_IO) {
/*
* Can't provide the coherency needed for MAP_SHARED
* if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
*/
if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
return -ENODEV;
invalidate_inode_pages2(file->f_mapping);
if (!(vma->vm_flags & VM_MAYSHARE)) {
/* MAP_PRIVATE */
return generic_file_mmap(file, vma);
}
}
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
fuse_link_write_file(file);
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
file_accessed(file);
vma->vm_ops = &fuse_file_vm_ops;
return 0;
}
static int convert_fuse_file_lock(struct fuse_conn *fc,
const struct fuse_file_lock *ffl,
struct file_lock *fl)
{
switch (ffl->type) {
case F_UNLCK:
break;
case F_RDLCK:
case F_WRLCK:
if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
ffl->end < ffl->start)
return -EIO;
fl->fl_start = ffl->start;
fl->fl_end = ffl->end;
/*
fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks Since commit c69899a17ca4 "NFSv4: Update of VFS byte range lock must be atomic with the stateid update", NFSv4 has been inserting locks in rpciod worker context. The result is that the file_lock's fl_nspid is the kworker's pid instead of the original userspace pid. The fl_nspid is only used to represent the namespaced virtual pid number when displaying locks or returning from F_GETLK. There's no reason to set it for every inserted lock, since we can usually just look it up from fl_pid. So, instead of looking up and holding struct pid for every lock, let's just look up the virtual pid number from fl_pid when it is needed. That means we can remove fl_nspid entirely. The translaton and presentation of fl_pid should handle the following four cases: 1 - F_GETLK on a remote file with a remote lock: In this case, the filesystem should determine the l_pid to return here. Filesystems should indicate that the fl_pid represents a non-local pid value that should not be translated by returning an fl_pid <= 0. 2 - F_GETLK on a local file with a remote lock: This should be the l_pid of the lock manager process, and translated. 3 - F_GETLK on a remote file with a local lock, and 4 - F_GETLK on a local file with a local lock: These should be the translated l_pid of the local locking process. Fuse was already doing the correct thing by translating the pid into the caller's namespace. With this change we must update fuse to translate to init's pid namespace, so that the locks API can then translate from init's pid namespace into the pid namespace of the caller. With this change, the locks API will expect that if a filesystem returns a remote pid as opposed to a local pid for F_GETLK, that remote pid will be <= 0. This signifies that the pid is remote, and the locks API will forego translating that pid into the pid namespace of the local calling process. Finally, we convert remote filesystems to present remote pids using negative numbers. Have lustre, 9p, ceph, cifs, and dlm negate the remote pid returned for F_GETLK lock requests. Since local pids will never be larger than PID_MAX_LIMIT (which is currently defined as <= 4 million), but pid_t is an unsigned int, we should have plenty of room to represent remote pids with negative numbers if we assume that remote pid numbers are similarly limited. If this is not the case, then we run the risk of having a remote pid returned for which there is also a corresponding local pid. This is a problem we have now, but this patch should reduce the chances of that occurring, while also returning those remote pid numbers, for whatever that may be worth. Signed-off-by: Benjamin Coddington <bcodding@redhat.com> Signed-off-by: Jeff Layton <jlayton@redhat.com>
2017-07-16 22:28:22 +08:00
* Convert pid into init's pid namespace. The locks API will
* translate it into the caller's pid namespace.
*/
rcu_read_lock();
fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks Since commit c69899a17ca4 "NFSv4: Update of VFS byte range lock must be atomic with the stateid update", NFSv4 has been inserting locks in rpciod worker context. The result is that the file_lock's fl_nspid is the kworker's pid instead of the original userspace pid. The fl_nspid is only used to represent the namespaced virtual pid number when displaying locks or returning from F_GETLK. There's no reason to set it for every inserted lock, since we can usually just look it up from fl_pid. So, instead of looking up and holding struct pid for every lock, let's just look up the virtual pid number from fl_pid when it is needed. That means we can remove fl_nspid entirely. The translaton and presentation of fl_pid should handle the following four cases: 1 - F_GETLK on a remote file with a remote lock: In this case, the filesystem should determine the l_pid to return here. Filesystems should indicate that the fl_pid represents a non-local pid value that should not be translated by returning an fl_pid <= 0. 2 - F_GETLK on a local file with a remote lock: This should be the l_pid of the lock manager process, and translated. 3 - F_GETLK on a remote file with a local lock, and 4 - F_GETLK on a local file with a local lock: These should be the translated l_pid of the local locking process. Fuse was already doing the correct thing by translating the pid into the caller's namespace. With this change we must update fuse to translate to init's pid namespace, so that the locks API can then translate from init's pid namespace into the pid namespace of the caller. With this change, the locks API will expect that if a filesystem returns a remote pid as opposed to a local pid for F_GETLK, that remote pid will be <= 0. This signifies that the pid is remote, and the locks API will forego translating that pid into the pid namespace of the local calling process. Finally, we convert remote filesystems to present remote pids using negative numbers. Have lustre, 9p, ceph, cifs, and dlm negate the remote pid returned for F_GETLK lock requests. Since local pids will never be larger than PID_MAX_LIMIT (which is currently defined as <= 4 million), but pid_t is an unsigned int, we should have plenty of room to represent remote pids with negative numbers if we assume that remote pid numbers are similarly limited. If this is not the case, then we run the risk of having a remote pid returned for which there is also a corresponding local pid. This is a problem we have now, but this patch should reduce the chances of that occurring, while also returning those remote pid numbers, for whatever that may be worth. Signed-off-by: Benjamin Coddington <bcodding@redhat.com> Signed-off-by: Jeff Layton <jlayton@redhat.com>
2017-07-16 22:28:22 +08:00
fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
rcu_read_unlock();
break;
default:
return -EIO;
}
fl->fl_type = ffl->type;
return 0;
}
static void fuse_lk_fill(struct fuse_args *args, struct file *file,
const struct file_lock *fl, int opcode, pid_t pid,
int flock, struct fuse_lk_in *inarg)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_file *ff = file->private_data;
memset(inarg, 0, sizeof(*inarg));
inarg->fh = ff->fh;
inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
inarg->lk.start = fl->fl_start;
inarg->lk.end = fl->fl_end;
inarg->lk.type = fl->fl_type;
inarg->lk.pid = pid;
if (flock)
inarg->lk_flags |= FUSE_LK_FLOCK;
args->opcode = opcode;
args->nodeid = get_node_id(inode);
args->in_numargs = 1;
args->in_args[0].size = sizeof(*inarg);
args->in_args[0].value = inarg;
}
static int fuse_getlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
struct fuse_lk_out outarg;
int err;
fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
err = fuse_simple_request(fm, &args);
if (!err)
err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl);
return err;
}
static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_lk_in inarg;
int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
int err;
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
/* NLM needs asynchronous locks, which we don't support yet */
return -ENOLCK;
}
/* Unlock on close is handled by the flush method */
if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fm, &args);
/* locking is restartable */
if (err == -EINTR)
err = -ERESTARTSYS;
return err;
}
static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
if (cmd == F_CANCELLK) {
err = 0;
} else if (cmd == F_GETLK) {
if (fc->no_lock) {
posix_test_lock(file, fl);
err = 0;
} else
err = fuse_getlk(file, fl);
} else {
if (fc->no_lock)
err = posix_lock_file(file, fl, NULL);
else
err = fuse_setlk(file, fl, 0);
}
return err;
}
static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
if (fc->no_flock) {
err = locks_lock_file_wait(file, fl);
} else {
struct fuse_file *ff = file->private_data;
/* emulate flock with POSIX locks */
ff->flock = true;
err = fuse_setlk(file, fl, 1);
}
return err;
}
static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
struct fuse_bmap_in inarg;
struct fuse_bmap_out outarg;
int err;
if (!inode->i_sb->s_bdev || fm->fc->no_bmap)
return 0;
memset(&inarg, 0, sizeof(inarg));
inarg.block = block;
inarg.blocksize = inode->i_sb->s_blocksize;
args.opcode = FUSE_BMAP;
args.nodeid = get_node_id(inode);
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS)
fm->fc->no_bmap = 1;
return err ? 0 : outarg.block;
}
static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
struct fuse_file *ff = file->private_data;
FUSE_ARGS(args);
struct fuse_lseek_in inarg = {
.fh = ff->fh,
.offset = offset,
.whence = whence
};
struct fuse_lseek_out outarg;
int err;
if (fm->fc->no_lseek)
goto fallback;
args.opcode = FUSE_LSEEK;
args.nodeid = ff->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
err = fuse_simple_request(fm, &args);
if (err) {
if (err == -ENOSYS) {
fm->fc->no_lseek = 1;
goto fallback;
}
return err;
}
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
err = fuse_update_attributes(inode, file, STATX_SIZE);
if (!err)
return generic_file_llseek(file, offset, whence);
else
return err;
}
static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
loff_t retval;
struct inode *inode = file_inode(file);
switch (whence) {
case SEEK_SET:
case SEEK_CUR:
/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
retval = generic_file_llseek(file, offset, whence);
break;
case SEEK_END:
inode_lock(inode);
retval = fuse_update_attributes(inode, file, STATX_SIZE);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
break;
case SEEK_HOLE:
case SEEK_DATA:
inode_lock(inode);
retval = fuse_lseek(file, offset, whence);
inode_unlock(inode);
break;
default:
retval = -EINVAL;
}
return retval;
}
/*
* All files which have been polled are linked to RB tree
* fuse_conn->polled_files which is indexed by kh. Walk the tree and
* find the matching one.
*/
static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
struct rb_node **parent_out)
{
struct rb_node **link = &fc->polled_files.rb_node;
struct rb_node *last = NULL;
while (*link) {
struct fuse_file *ff;
last = *link;
ff = rb_entry(last, struct fuse_file, polled_node);
if (kh < ff->kh)
link = &last->rb_left;
else if (kh > ff->kh)
link = &last->rb_right;
else
return link;
}
if (parent_out)
*parent_out = last;
return link;
}
/*
* The file is about to be polled. Make sure it's on the polled_files
* RB tree. Note that files once added to the polled_files tree are
* not removed before the file is released. This is because a file
* polled once is likely to be polled again.
*/
static void fuse_register_polled_file(struct fuse_conn *fc,
struct fuse_file *ff)
{
spin_lock(&fc->lock);
if (RB_EMPTY_NODE(&ff->polled_node)) {
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-04 04:09:38 +08:00
struct rb_node **link, *parent;
link = fuse_find_polled_node(fc, ff->kh, &parent);
BUG_ON(*link);
rb_link_node(&ff->polled_node, parent, link);
rb_insert_color(&ff->polled_node, &fc->polled_files);
}
spin_unlock(&fc->lock);
}
__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
struct fuse_poll_out outarg;
FUSE_ARGS(args);
int err;
if (fm->fc->no_poll)
return DEFAULT_POLLMASK;
poll_wait(file, &ff->poll_wait, wait);
inarg.events = mangle_poll(poll_requested_events(wait));
/*
* Ask for notification iff there's someone waiting for it.
* The client may ignore the flag and always notify.
*/
if (waitqueue_active(&ff->poll_wait)) {
inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
fuse_register_polled_file(fm->fc, ff);
}
args.opcode = FUSE_POLL;
args.nodeid = ff->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
err = fuse_simple_request(fm, &args);
if (!err)
return demangle_poll(outarg.revents);
if (err == -ENOSYS) {
fm->fc->no_poll = 1;
return DEFAULT_POLLMASK;
}
return EPOLLERR;
}
EXPORT_SYMBOL_GPL(fuse_file_poll);
/*
* This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
* wakes up the poll waiters.
*/
int fuse_notify_poll_wakeup(struct fuse_conn *fc,
struct fuse_notify_poll_wakeup_out *outarg)
{
u64 kh = outarg->kh;
struct rb_node **link;
spin_lock(&fc->lock);
link = fuse_find_polled_node(fc, kh, NULL);
if (*link) {
struct fuse_file *ff;
ff = rb_entry(*link, struct fuse_file, polled_node);
wake_up_interruptible_sync(&ff->poll_wait);
}
spin_unlock(&fc->lock);
return 0;
}
static void fuse_do_truncate(struct file *file)
{
struct inode *inode = file->f_mapping->host;
struct iattr attr;
attr.ia_valid = ATTR_SIZE;
attr.ia_size = i_size_read(inode);
attr.ia_file = file;
attr.ia_valid |= ATTR_FILE;
fuse_do_setattr(file_dentry(file), &attr, file);
}
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided latter. 3. run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin <const@MakeLinux.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
return round_up(off, fc->max_pages << PAGE_SHIFT);
}
static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
DECLARE_COMPLETION_ONSTACK(wait);
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
size_t count = iov_iter_count(iter), shortened = 0;
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
pos = offset;
inode = file->f_mapping->host;
i_size = i_size_read(inode);
if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
Fix race when checking i_size on direct i/o read So far I've had one ACK for this, and no other comments. So I think it is probably time to send this via some suitable tree. I'm guessing that the vfs tree would be the most appropriate route, but not sure that there is one at the moment (don't see anything recent at kernel.org) so in that case I think -mm is the "back up plan". Al, please let me know if you will take this? Steve. --------------------- Following on from the "Re: [PATCH v3] vfs: fix a bug when we do some dio reads with append dio writes" thread on linux-fsdevel, this patch is my current version of the fix proposed as option (b) in that thread. Removing the i_size test from the direct i/o read path at vfs level means that filesystems now have to deal with requests which are beyond i_size themselves. These I've divided into three sets: a) Those with "no op" ->direct_IO (9p, cifs, ceph) These are obviously not going to be an issue b) Those with "home brew" ->direct_IO (nfs, fuse) I've been told that NFS should not have any problem with the larger i_size, however I've added an extra test to FUSE to duplicate the original behaviour just to be on the safe side. c) Those using __blockdev_direct_IO() These call through to ->get_block() which should deal with the EOF condition correctly. I've verified that with GFS2 and I believe that Zheng has verified it for ext4. I've also run the test on XFS and it passes both before and after this change. The part of the patch in filemap.c looks a lot larger than it really is - there are only two lines of real change. The rest is just indentation of the contained code. There remains a test of i_size though, which was added for btrfs. It doesn't cause the other filesystems a problem as the test is performed after ->direct_IO has been called. It is possible that there is a race that does matter to btrfs, however this patch doesn't change that, so its still an overall improvement. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com> Reported-by: Zheng Liu <gnehzuil.liu@gmail.com> Cc: Jan Kara <jack@suse.cz> Cc: Dave Chinner <david@fromorbit.com> Acked-by: Miklos Szeredi <miklos@szeredi.hu> Cc: Chris Mason <clm@fb.com> Cc: Josef Bacik <jbacik@fb.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2014-01-24 22:42:22 +08:00
return 0;
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
if (!io)
return -ENOMEM;
spin_lock_init(&io->lock);
kref_init(&io->refcnt);
io->reqs = 1;
io->bytes = -1;
io->size = 0;
io->offset = offset;
io->write = (iov_iter_rw(iter) == WRITE);
io->err = 0;
/*
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
fuse update for 5.10 -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQSQHSd0lITzzeNWNm3h3BK/laaZPAUCX4n0/gAKCRDh3BK/laaZ PM3jAP4xhaix0j/y3VyaxsUqWg6ZSrjq6X0o9clGMJv27IAtjgD/fJ7ZwzTldojD qb7N3utjLiPVRjwFmvsZ8JZ7O7PbwQ0= =oUbZ -----END PGP SIGNATURE----- Merge tag 'fuse-update-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse Pull fuse updates from Miklos Szeredi: - Support directly accessing host page cache from virtiofs. This can improve I/O performance for various workloads, as well as reducing the memory requirement by eliminating double caching. Thanks to Vivek Goyal for doing most of the work on this. - Allow automatic submounting inside virtiofs. This allows unique st_dev/ st_ino values to be assigned inside the guest to files residing on different filesystems on the host. Thanks to Max Reitz for the patches. - Fix an old use after free bug found by Pradeep P V K. * tag 'fuse-update-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse: (25 commits) virtiofs: calculate number of scatter-gather elements accurately fuse: connection remove fix fuse: implement crossmounts fuse: Allow fuse_fill_super_common() for submounts fuse: split fuse_mount off of fuse_conn fuse: drop fuse_conn parameter where possible fuse: store fuse_conn in fuse_req fuse: add submount support to <uapi/linux/fuse.h> fuse: fix page dereference after free virtiofs: add logic to free up a memory range virtiofs: maintain a list of busy elements virtiofs: serialize truncate/punch_hole and dax fault path virtiofs: define dax address space operations virtiofs: add DAX mmap support virtiofs: implement dax read/write operations virtiofs: introduce setupmapping/removemapping commands virtiofs: implement FUSE_INIT map_alignment field virtiofs: keep a list of free dax memory ranges virtiofs: add a mount option to enable dax virtiofs: set up virtio_fs dax_device ...
2020-10-20 05:28:30 +08:00
io->async = ff->fm->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
/* optimization for short read */
if (io->async && !io->write && offset + count > i_size) {
fuse update for 5.10 -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQSQHSd0lITzzeNWNm3h3BK/laaZPAUCX4n0/gAKCRDh3BK/laaZ PM3jAP4xhaix0j/y3VyaxsUqWg6ZSrjq6X0o9clGMJv27IAtjgD/fJ7ZwzTldojD qb7N3utjLiPVRjwFmvsZ8JZ7O7PbwQ0= =oUbZ -----END PGP SIGNATURE----- Merge tag 'fuse-update-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse Pull fuse updates from Miklos Szeredi: - Support directly accessing host page cache from virtiofs. This can improve I/O performance for various workloads, as well as reducing the memory requirement by eliminating double caching. Thanks to Vivek Goyal for doing most of the work on this. - Allow automatic submounting inside virtiofs. This allows unique st_dev/ st_ino values to be assigned inside the guest to files residing on different filesystems on the host. Thanks to Max Reitz for the patches. - Fix an old use after free bug found by Pradeep P V K. * tag 'fuse-update-5.10' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse: (25 commits) virtiofs: calculate number of scatter-gather elements accurately fuse: connection remove fix fuse: implement crossmounts fuse: Allow fuse_fill_super_common() for submounts fuse: split fuse_mount off of fuse_conn fuse: drop fuse_conn parameter where possible fuse: store fuse_conn in fuse_req fuse: add submount support to <uapi/linux/fuse.h> fuse: fix page dereference after free virtiofs: add logic to free up a memory range virtiofs: maintain a list of busy elements virtiofs: serialize truncate/punch_hole and dax fault path virtiofs: define dax address space operations virtiofs: add DAX mmap support virtiofs: implement dax read/write operations virtiofs: introduce setupmapping/removemapping commands virtiofs: implement FUSE_INIT map_alignment field virtiofs: keep a list of free dax memory ranges virtiofs: add a mount option to enable dax virtiofs: set up virtio_fs dax_device ...
2020-10-20 05:28:30 +08:00
iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset));
shortened = count - iov_iter_count(iter);
count -= shortened;
}
/*
* We cannot asynchronously extend the size of a file.
* In such case the aio will behave exactly like sync io.
*/
if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) {
/*
* Additional reference to keep io around after
* calling fuse_aio_complete()
*/
kref_get(&io->refcnt);
io->done = &wait;
}
if (iov_iter_rw(iter) == WRITE) {
ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) {
bool blocking = io->blocking;
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
if (!blocking)
return -EIOCBQUEUED;
wait_for_completion(&wait);
ret = fuse_get_res_by_io(io);
}
kref_put(&io->refcnt, fuse_io_release);
if (iov_iter_rw(iter) == WRITE) {
fuse_write_update_attr(inode, pos, ret);
fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh <dsingh@ddn.com> Signed-off-by: Bernd Schubert <bschubert@ddn.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2022-06-17 15:10:27 +08:00
/* For extending writes we already hold exclusive lock */
if (ret < 0 && offset + count > i_size)
fuse_do_truncate(file);
}
return ret;
}
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
int err = filemap_write_and_wait_range(inode->i_mapping, start, LLONG_MAX);
if (!err)
fuse_sync_writes(inode);
return err;
}
static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
loff_t length)
{
struct fuse_file *ff = file->private_data;
struct inode *inode = file_inode(file);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_mount *fm = ff->fm;
FUSE_ARGS(args);
struct fuse_fallocate_in inarg = {
.fh = ff->fh,
.offset = offset,
.length = length,
.mode = mode
};
int err;
bool block_faults = FUSE_IS_DAX(inode) &&
(!(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
inode_lock(inode);
if (block_faults) {
filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
}
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) {
loff_t endbyte = offset + length - 1;
err = fuse_writeback_range(inode, offset, endbyte);
if (err)
goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + length > i_size_read(inode)) {
err = inode_newsize_ok(inode, offset + length);
if (err)
goto out;
}
err = file_modified(file);
if (err)
goto out;
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
args.opcode = FUSE_FALLOCATE;
args.nodeid = ff->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_fallocate = 1;
err = -EOPNOTSUPP;
}
if (err)
goto out;
/* we could have extended the file */
if (!(mode & FALLOC_FL_KEEP_SIZE)) {
if (fuse_write_update_attr(inode, offset + length, length))
file_update_time(file);
}
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
truncate_pagecache_range(inode, offset, offset + length - 1);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
if (block_faults)
filemap_invalidate_unlock(inode->i_mapping);
virtiofs: serialize truncate/punch_hole and dax fault path Currently in fuse we don't seem have any lock which can serialize fault path with truncate/punch_hole path. With dax support I need one for following reasons. 1. Dax requirement DAX fault code relies on inode size being stable for the duration of fault and want to serialize with truncate/punch_hole and they explicitly mention it. static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 2. Make sure there are no users of pages being truncated/punch_hole get_user_pages() might take references to page and then do some DMA to said pages. Filesystem might truncate those pages without knowing that a DMA is in progress or some I/O is in progress. So use dax_layout_busy_page() to make sure there are no such references and I/O is not in progress on said pages before moving ahead with truncation. 3. Limitation of kvm page fault error reporting If we are truncating file on host first and then removing mappings in guest lateter (truncate page cache etc), then this could lead to a problem with KVM. Say a mapping is in place in guest and truncation happens on host. Now if guest accesses that mapping, then host will take a fault and kvm will either exit to qemu or spin infinitely. IOW, before we do truncation on host, we need to make sure that guest inode does not have any mapping in that region or whole file. 4. virtiofs memory range reclaim Soon I will introduce the notion of being able to reclaim dax memory ranges from a fuse dax inode. There also I need to make sure that no I/O or fault is going on in the reclaimed range and nobody is using it so that range can be reclaimed without issues. Currently if we take inode lock, that serializes read/write. But it does not do anything for faults. So I add another semaphore fuse_inode->i_mmap_sem for this purpose. It can be used to serialize with faults. As of now, I am adding taking this semaphore only in dax fault path and not regular fault path because existing code does not have one. May be existing code can benefit from it as well to take care of some races, but that we can fix later if need be. For now, I am just focussing only on DAX path which is new path. Also added logic to take fuse_inode->i_mmap_sem in truncate/punch_hole/open(O_TRUNC) path to make sure file truncation and fuse dax fault are mutually exlusive and avoid all the above problems. Signed-off-by: Vivek Goyal <vgoyal@redhat.com> Cc: Dave Chinner <david@fromorbit.com> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2020-08-20 06:19:54 +08:00
inode_unlock(inode);
fuse_flush_time_update(inode);
return err;
}
static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t len, unsigned int flags)
{
struct fuse_file *ff_in = file_in->private_data;
struct fuse_file *ff_out = file_out->private_data;
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
struct fuse_inode *fi_out = get_fuse_inode(inode_out);
struct fuse_mount *fm = ff_in->fm;
struct fuse_conn *fc = fm->fc;
FUSE_ARGS(args);
struct fuse_copy_file_range_in inarg = {
.fh_in = ff_in->fh,
.off_in = pos_in,
.nodeid_out = ff_out->nodeid,
.fh_out = ff_out->fh,
.off_out = pos_out,
.len = len,
.flags = flags
};
struct fuse_write_out outarg;
ssize_t err;
/* mark unstable when write-back is not used, and file_out gets
* extended */
bool is_unstable = (!fc->writeback_cache) &&
((pos_out + len) > inode_out->i_size);
if (fc->no_copy_file_range)
return -EOPNOTSUPP;
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
inode_lock(inode_in);
err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
inode_unlock(inode_in);
if (err)
return err;
inode_lock(inode_out);
err = file_modified(file_out);
if (err)
goto out;
/*
* Write out dirty pages in the destination file before sending the COPY
* request to userspace. After the request is completed, truncate off
* pages (including partial ones) from the cache that have been copied,
* since these contain stale data at that point.
*
* This should be mostly correct, but if the COPY writes to partial
* pages (at the start or end) and the parts not covered by the COPY are
* written through a memory map after calling fuse_writeback_range(),
* then these partial page modifications will be lost on truncation.
*
* It is unlikely that someone would rely on such mixed style
* modifications. Yet this does give less guarantees than if the
* copying was performed with write(2).
*
* To fix this a mapping->invalidate_lock could be used to prevent new
* faults while the copy is ongoing.
*/
err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
if (err)
goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
args.opcode = FUSE_COPY_FILE_RANGE;
args.nodeid = ff_in->nodeid;
args.in_numargs = 1;
args.in_args[0].size = sizeof(inarg);
args.in_args[0].value = &inarg;
args.out_numargs = 1;
args.out_args[0].size = sizeof(outarg);
args.out_args[0].value = &outarg;
err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fc->no_copy_file_range = 1;
err = -EOPNOTSUPP;
}
if (err)
goto out;
truncate_inode_pages_range(inode_out->i_mapping,
ALIGN_DOWN(pos_out, PAGE_SIZE),
ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
file_update_time(file_out);
fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size);
err = outarg.size;
out:
if (is_unstable)
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
inode_unlock(inode_out);
file_accessed(file_in);
fuse_flush_time_update(inode_out);
return err;
}
static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
{
ssize_t ret;
ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
return ret;
}
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
.read_iter = fuse_file_read_iter,
.write_iter = fuse_file_write_iter,
.mmap = fuse_file_mmap,
.open = fuse_open,
.flush = fuse_flush,
.release = fuse_release,
.fsync = fuse_fsync,
.lock = fuse_file_lock,
.get_unmapped_area = thp_get_unmapped_area,
.flock = fuse_file_flock,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = fuse_file_ioctl,
.compat_ioctl = fuse_file_compat_ioctl,
.poll = fuse_file_poll,
.fallocate = fuse_file_fallocate,
.copy_file_range = fuse_copy_file_range,
};
static const struct address_space_operations fuse_file_aops = {
.read_folio = fuse_read_folio,
.readahead = fuse_readahead,
fuse: support writable mmap Quoting Linus (3 years ago, FUSE inclusion discussions): "User-space filesystems are hard to get right. I'd claim that they are almost impossible, unless you limit them somehow (shared writable mappings are the nastiest part - if you don't have those, you can reasonably limit your problems by limiting the number of dirty pages you accept through normal "write()" calls)." Instead of attempting the impossible, I've just waited for the dirty page accounting infrastructure to materialize (thanks to Peter Zijlstra and others). This nicely solved the biggest problem: limiting the number of pages used for write caching. Some small details remained, however, which this largish patch attempts to address. It provides a page writeback implementation for fuse, which is completely safe against VM related deadlocks. Performance may not be very good for certain usage patterns, but generally it should be acceptable. It has been tested extensively with fsx-linux and bash-shared-mapping. Fuse page writeback design -------------------------- fuse_writepage() allocates a new temporary page with GFP_NOFS|__GFP_HIGHMEM. It copies the contents of the original page, and queues a WRITE request to the userspace filesystem using this temp page. The writeback is finished instantly from the MM's point of view: the page is removed from the radix trees, and the PageDirty and PageWriteback flags are cleared. For the duration of the actual write, the NR_WRITEBACK_TEMP counter is incremented. The per-bdi writeback count is not decremented until the actual write completes. On dirtying the page, fuse waits for a previous write to finish before proceeding. This makes sure, there can only be one temporary page used at a time for one cached page. This approach is wasteful in both memory and CPU bandwidth, so why is this complication needed? The basic problem is that there can be no guarantee about the time in which the userspace filesystem will complete a write. It may be buggy or even malicious, and fail to complete WRITE requests. We don't want unrelated parts of the system to grind to a halt in such cases. Also a filesystem may need additional resources (particularly memory) to complete a WRITE request. There's a great danger of a deadlock if that allocation may wait for the writepage to finish. Currently there are several cases where the kernel can block on page writeback: - allocation order is larger than PAGE_ALLOC_COSTLY_ORDER - page migration - throttle_vm_writeout (through NR_WRITEBACK) - sync(2) Of course in some cases (fsync, msync) we explicitly want to allow blocking. So for these cases new code has to be added to fuse, since the VM is not tracking writeback pages for us any more. As an extra safetly measure, the maximum dirty ratio allocated to a single fuse filesystem is set to 1% by default. This way one (or several) buggy or malicious fuse filesystems cannot slow down the rest of the system by hogging dirty memory. With appropriate privileges, this limit can be raised through '/sys/class/bdi/<bdi>/max_ratio'. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:54:41 +08:00
.writepage = fuse_writepage,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
.dirty_folio = filemap_dirty_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
.write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
fi->writectr = 0;
init_waitqueue_head(&fi->page_waitq);
fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_inode_init(inode, flags);
}