Enable passthrough mode for read/write operations (#919)

Add support for filesystem passthrough read/write of files.

When the FUSE_PASSTHROUGH capability is enabled, the FUSE server may
decide, while handling the "open" or "create" requests, if the given
file can be accessed by that process in "passthrough" mode, meaning that
all the further read and write operations would be forwarded by the
kernel directly to the backing file rather than to the FUSE server.
All requests other than read or write are still handled by the server.

This allows for improved performance on reads and writes, especially
in the case of reads at random offsets, for which no (readahead)
caching mechanism would help, reducing the performance gap between FUSE
and native filesystem access.

Extend also the passthrough_hp example with the new passthrough feature.
This example opens a kernel backing file per FUSE inode on the first
FUSE file open of that inode and closes the backing file on the release
of the last FUSE file on that inode.

All opens of the same inode passthrough to the same backing file.
A combination of fi->direct_io and fi->passthrough is allowed.
It means that read/write operations go directly to the server, but mmap
is done on the backing file.

This allows opening some fds of an inode in passthrough mode and other
fds of the same inode in direct_io/passthrough_mmap mode.

Signed-off-by: Alessio Balsini <balsini@android.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
This commit is contained in:
Amir Goldstein 2024-05-13 16:30:25 +03:00 committed by GitHub
parent 58f85bfa9b
commit eca63dab45
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 180 additions and 6 deletions

View File

@ -123,6 +123,7 @@ struct Inode {
dev_t src_dev {0};
ino_t src_ino {0};
int generation {0};
int backing_id {0};
uint64_t nopen {0};
uint64_t nlookup {0};
std::mutex m;
@ -159,6 +160,7 @@ struct Fs {
bool clone_fd;
std::string fuse_mount_options;
bool direct_io;
bool passthrough;
};
static Fs fs{};
@ -190,7 +192,15 @@ static int get_fs_fd(fuse_ino_t ino) {
static void sfs_init(void *userdata, fuse_conn_info *conn) {
(void)userdata;
if (fs.timeout && conn->capable & FUSE_CAP_WRITEBACK_CACHE)
if (fs.passthrough && conn->capable & FUSE_CAP_PASSTHROUGH)
conn->want |= FUSE_CAP_PASSTHROUGH;
else
fs.passthrough = false;
/* Passthrough and writeback cache are conflicting modes */
if (fs.timeout && !fs.passthrough &&
conn->capable & FUSE_CAP_WRITEBACK_CACHE)
conn->want |= FUSE_CAP_WRITEBACK_CACHE;
if (conn->capable & FUSE_CAP_FLOCK_LOCKS)
@ -810,6 +820,30 @@ static void sfs_releasedir(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
}
/*
 * Attach the per-inode shared backing file to the reply of an open/create.
 *
 * If the inode has no backing id yet, fd is registered through
 * fuse_passthrough_open() and the resulting id is stored in the inode;
 * otherwise the stored id is reused. When registration fails, rw
 * passthrough is switched off in fs and fi->backing_id is left untouched.
 * Whenever fi ends up carrying a backing id, keep_cache is cleared.
 */
static void do_passthrough_open(fuse_req_t req, fuse_ino_t ino, int fd,
                                fuse_file_info *fi) {
    Inode& inode = get_inode(ino);

    if (!inode.backing_id) {
        /* First open of this inode: try to set up the shared backing file */
        inode.backing_id = fuse_passthrough_open(req, fd);
        if (!inode.backing_id) {
            cerr << "DEBUG: fuse_passthrough_open failed for inode " << ino
                 << ", disabling rw passthrough." << endl;
            fs.passthrough = false;
        } else if (fs.debug) {
            cerr << "DEBUG: setup shared backing file "
                 << inode.backing_id << " for inode " << ino << endl;
        }
    } else if (fs.debug) {
        cerr << "DEBUG: reusing shared backing file "
             << inode.backing_id << " for inode " << ino << endl;
    }

    /* Hand the (new or reused) id to the reply only if we have one */
    if (inode.backing_id)
        fi->backing_id = inode.backing_id;

    /* open in passthrough mode must drop old page cache */
    if (fi->backing_id)
        fi->keep_cache = false;
}
static void sfs_create(fuse_req_t req, fuse_ino_t parent, const char *name,
mode_t mode, fuse_file_info *fi) {
Inode& inode_p = get_inode(parent);
@ -845,6 +879,8 @@ static void sfs_create(fuse_req_t req, fuse_ino_t parent, const char *name,
Inode& inode = get_inode(e.ino);
lock_guard<mutex> g {inode.m};
inode.nopen++;
if (fs.passthrough)
do_passthrough_open(req, e.ino, fd, fi);
fuse_reply_create(req, &e, fi);
}
@ -914,6 +950,8 @@ static void sfs_open(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
fi->parallel_direct_writes = 1;
fi->fh = fd;
if (fs.passthrough)
do_passthrough_open(req, ino, fd, fi);
fuse_reply_open(req, fi);
}
@ -922,6 +960,19 @@ static void sfs_release(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
Inode& inode = get_inode(ino);
lock_guard<mutex> g {inode.m};
inode.nopen--;
/* Close the shared backing file on last file close of an inode */
if (inode.backing_id && !inode.nopen) {
if (fuse_passthrough_close(req, inode.backing_id) < 0) {
cerr << "DEBUG: fuse_passthrough_close failed for inode "
<< ino << " backing file " << inode.backing_id << endl;
} else if (fs.debug) {
cerr << "DEBUG: closed backing file " << inode.backing_id
<< " for inode " << ino << endl;
}
inode.backing_id = 0;
}
close(fi->fh);
fuse_reply_err(req, 0);
}
@ -960,6 +1011,11 @@ static void do_read(fuse_req_t req, size_t size, off_t off, fuse_file_info *fi)
/*
 * FUSE read handler.
 *
 * When rw passthrough is on and direct-io is off, a read request reaching
 * this handler is reported as an error and answered with EIO. In every
 * other configuration the request is forwarded to do_read().
 */
static void sfs_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
                     fuse_file_info *fi) {
    (void) ino;

    const bool unexpected_request = fs.passthrough && !fs.direct_io;
    if (!unexpected_request) {
        do_read(req, size, off, fi);
        return;
    }

    cerr << "ERROR: fuse_passthrough read failed." << endl;
    fuse_reply_err(req, EIO);
}
@ -983,6 +1039,11 @@ static void do_write_buf(fuse_req_t req, size_t size, off_t off,
/*
 * FUSE write_buf handler.
 *
 * When rw passthrough is on and direct-io is off, a write request reaching
 * this handler is reported as an error and answered with EIO. In every
 * other configuration the buffer size is computed with fuse_buf_size() and
 * the request is forwarded to do_write_buf().
 */
static void sfs_write_buf(fuse_req_t req, fuse_ino_t ino, fuse_bufvec *in_buf,
                          off_t off, fuse_file_info *fi) {
    (void) ino;

    const bool unexpected_request = fs.passthrough && !fs.direct_io;
    if (!unexpected_request) {
        auto nbytes {fuse_buf_size(in_buf)};
        do_write_buf(req, nbytes, off, in_buf, fi);
        return;
    }

    cerr << "ERROR: fuse_passthrough write failed." << endl;
    fuse_reply_err(req, EIO);
}
@ -1232,6 +1293,7 @@ static cxxopts::ParseResult parse_options(int argc, char **argv) {
("help", "Print help")
("nocache", "Disable attribute all caching")
("nosplice", "Do not use splice(2) to transfer data")
("nopassthrough", "Do not use pass-through mode for read/write")
("single", "Run single-threaded")
("o", "Mount options (see mount.fuse(5) - only use if you know what "
"you are doing)", cxxopts::value(mount_options))
@ -1240,7 +1302,6 @@ static cxxopts::ParseResult parse_options(int argc, char **argv) {
("clone-fd", "use separate fuse device fd for each thread")
("direct-io", "enable fuse kernel internal direct-io");
// FIXME: Find a better way to limit the try clause to just
// opt_parser.parse() (cf. https://github.com/jarro2783/cxxopts/issues/146)
auto options = parse_wrapper(opt_parser, argc, argv);
@ -1268,6 +1329,7 @@ static cxxopts::ParseResult parse_options(int argc, char **argv) {
fs.foreground = true;
fs.nosplice = options.count("nosplice") != 0;
fs.passthrough = options.count("nopassthrough") == 0;
fs.num_threads = options["num-threads"].as<int>();
fs.clone_fd = options.count("clone-fd");
fs.direct_io = options.count("direct-io");

View File

@ -105,6 +105,11 @@ struct fuse_file_info {
/** Requested poll events. Available in ->poll. Only set on kernels
which support it. If unsupported, this field is set to zero. */
uint32_t poll_events;
/** Passthrough backing file id. May be filled in by filesystem in
* create and open. It is used to create a passthrough connection
* between FUSE file and backing file. */
int32_t backing_id;
};
@ -468,6 +473,18 @@ struct fuse_loop_config_v1 {
*/
#define FUSE_CAP_DIRECT_IO_ALLOW_MMAP (1 << 28)
/**
* Indicates support for passthrough mode access for read/write operations.
*
* If this flag is set in the `capable` field of the `fuse_conn_info`
* structure, then the FUSE kernel module supports redirecting read/write
* operations to the backing file instead of letting them be handled
* by the FUSE daemon.
*
* This feature is disabled by default.
*/
#define FUSE_CAP_PASSTHROUGH (1 << 29)
/**
* Ioctl flags
*
@ -597,10 +614,30 @@ struct fuse_conn_info {
*/
unsigned time_gran;
/**
* When FUSE_CAP_PASSTHROUGH is enabled, this is the maximum allowed
* stacking depth of the backing files. In current kernel, the maximum
* allowed stack depth is FILESYSTEM_MAX_STACK_DEPTH (2), which includes
* the FUSE passthrough layer, so the maximum stacking depth for backing
* files is 1.
*
* The default is FUSE_BACKING_STACKED_UNDER (0), meaning that the
* backing files cannot be on a stacked filesystem, but another stacked
* filesystem can be stacked over this FUSE passthrough filesystem.
*
* Set this to FUSE_BACKING_STACKED_OVER (1) if backing files may be on
* a stacked filesystem, such as overlayfs or another FUSE passthrough.
* In this configuration, another stacked filesystem cannot be stacked
* over this FUSE passthrough filesystem.
*/
#define FUSE_BACKING_STACKED_UNDER (0)
#define FUSE_BACKING_STACKED_OVER (1)
unsigned max_backing_stack_depth;
/**
* For future use.
*/
unsigned reserved[22];
unsigned reserved[21];
};
struct fuse_session;

View File

@ -211,6 +211,10 @@
* 7.39
* - add FUSE_DIRECT_IO_ALLOW_MMAP
* - add FUSE_STATX and related structures
*
* 7.40
* - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag
* - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag
*/
#ifndef _LINUX_FUSE_H
@ -246,7 +250,7 @@
#define FUSE_KERNEL_VERSION 7
/** Minor version number of this interface */
#define FUSE_KERNEL_MINOR_VERSION 39
#define FUSE_KERNEL_MINOR_VERSION 40
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
@ -353,6 +357,7 @@ struct fuse_file_lock {
* FOPEN_STREAM: the file is stream-like (no file position at all)
* FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
* FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode
* FOPEN_PASSTHROUGH: passthrough read/write operations for this open file
*/
#define FOPEN_DIRECT_IO (1 << 0)
#define FOPEN_KEEP_CACHE (1 << 1)
@ -361,6 +366,7 @@ struct fuse_file_lock {
#define FOPEN_STREAM (1 << 4)
#define FOPEN_NOFLUSH (1 << 5)
#define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6)
#define FOPEN_PASSTHROUGH (1 << 7)
/**
* INIT request/reply flags
@ -449,6 +455,7 @@ struct fuse_file_lock {
#define FUSE_CREATE_SUPP_GROUP (1ULL << 34)
#define FUSE_HAS_EXPIRE_ONLY (1ULL << 35)
#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36)
#define FUSE_PASSTHROUGH (1ULL << 37)
/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */
#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP
@ -761,7 +768,7 @@ struct fuse_create_in {
struct fuse_open_out {
uint64_t fh;
uint32_t open_flags;
uint32_t padding;
int32_t backing_id;
};
struct fuse_release_in {
@ -877,7 +884,8 @@ struct fuse_init_out {
uint16_t max_pages;
uint16_t map_alignment;
uint32_t flags2;
uint32_t unused[7];
uint32_t max_stack_depth;
uint32_t unused[6];
};
#define CUSE_INIT_INFO_MAX 4096
@ -1049,9 +1057,18 @@ struct fuse_notify_retrieve_in {
uint64_t dummy4;
};
/*
 * Argument of the FUSE_DEV_IOC_BACKING_OPEN device ioctl.
 *
 * libfuse's fuse_passthrough_open() fills in only fd and leaves the other
 * members zero.
 */
struct fuse_backing_map {
/* File descriptor of the backing file to register */
int32_t fd;
/* NOTE(review): no flag values are defined here; presumably must be 0 */
uint32_t flags;
/* NOTE(review): presumably reserved, keep zeroed - confirm against kernel */
uint64_t padding;
};
/* Device ioctls: */
#define FUSE_DEV_IOC_MAGIC 229
#define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t)
#define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \
struct fuse_backing_map)
#define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t)
struct fuse_lseek_in {
uint64_t fh;

View File

@ -1396,6 +1396,19 @@ int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
*/
int fuse_reply_readlink(fuse_req_t req, const char *link);
/**
* Setup passthrough backing file for open reply
*
* The returned backing id is meant to be stored in the backing_id field
* of struct fuse_file_info before replying to the request.
*
* Possible requests:
* open, opendir, create
*
* @param req request handle
* @param fd backing file descriptor
* @return positive backing id for success, 0 for failure
*/
int fuse_passthrough_open(fuse_req_t req, int fd);
/**
* Close a passthrough backing file
*
* Releases a backing id previously returned by fuse_passthrough_open().
*
* @param req request handle
* @param backing_id backing id to close
* @return result of the underlying device ioctl; negative on failure
*/
int fuse_passthrough_close(fuse_req_t req, int backing_id);
/**
* Reply with open parameters
*

View File

@ -27,6 +27,7 @@
#include <errno.h>
#include <assert.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE 1024
@ -400,6 +401,10 @@ static void fill_open(struct fuse_open_out *arg,
const struct fuse_file_info *f)
{
arg->fh = f->fh;
if (f->backing_id > 0) {
arg->backing_id = f->backing_id;
arg->open_flags |= FOPEN_PASSTHROUGH;
}
if (f->direct_io)
arg->open_flags |= FOPEN_DIRECT_IO;
if (f->keep_cache)
@ -466,6 +471,31 @@ int fuse_reply_readlink(fuse_req_t req, const char *linkname)
return send_reply_ok(req, linkname, strlen(linkname));
}
/*
 * Register fd as a passthrough backing file on the session's FUSE device.
 *
 * Issues FUSE_DEV_IOC_BACKING_OPEN with a fuse_backing_map whose only
 * non-zero member is fd. Returns the positive id produced by the ioctl;
 * any non-positive result is logged and reported to the caller as 0.
 */
int fuse_passthrough_open(fuse_req_t req, int fd)
{
	struct fuse_backing_map map = { .fd = fd };
	int backing_id = ioctl(req->se->fd, FUSE_DEV_IOC_BACKING_OPEN, &map);

	if (backing_id > 0)
		return backing_id;

	fuse_log(FUSE_LOG_ERR, "fuse: passthrough_open: %s\n", strerror(errno));
	return 0;
}
/*
 * Release a passthrough backing file on the session's FUSE device.
 *
 * Issues FUSE_DEV_IOC_BACKING_CLOSE for backing_id and returns the ioctl
 * result unchanged; a negative result is logged first.
 */
int fuse_passthrough_close(fuse_req_t req, int backing_id)
{
	int rc = ioctl(req->se->fd, FUSE_DEV_IOC_BACKING_CLOSE, &backing_id);

	if (rc >= 0)
		return rc;

	fuse_log(FUSE_LOG_ERR, "fuse: passthrough_close: %s\n", strerror(errno));
	return rc;
}
int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f)
{
struct fuse_open_out arg;
@ -2027,6 +2057,8 @@ void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg)
se->conn.capable |= FUSE_CAP_DIRECT_IO_ALLOW_MMAP;
if (arg->minor >= 38 || (inargflags & FUSE_HAS_EXPIRE_ONLY))
se->conn.capable |= FUSE_CAP_EXPIRE_ONLY;
if (inargflags & FUSE_PASSTHROUGH)
se->conn.capable |= FUSE_CAP_PASSTHROUGH;
} else {
se->conn.max_readahead = 0;
}
@ -2161,6 +2193,14 @@ void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg)
outargflags |= FUSE_SETXATTR_EXT;
if (se->conn.want & FUSE_CAP_DIRECT_IO_ALLOW_MMAP)
outargflags |= FUSE_DIRECT_IO_ALLOW_MMAP;
if (se->conn.want & FUSE_CAP_PASSTHROUGH) {
outargflags |= FUSE_PASSTHROUGH;
/*
* outarg.max_stack_depth includes the fuse stack layer,
* so it is one more than max_backing_stack_depth.
*/
outarg.max_stack_depth = se->conn.max_backing_stack_depth + 1;
}
if (inargflags & FUSE_INIT_EXT) {
outargflags |= FUSE_INIT_EXT;
@ -2199,6 +2239,9 @@ void do_init(fuse_req_t req, fuse_ino_t nodeid, const void *inarg)
outarg.congestion_threshold);
fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n",
outarg.time_gran);
if (se->conn.want & FUSE_CAP_PASSTHROUGH)
fuse_log(FUSE_LOG_DEBUG, " max_stack_depth=%u\n",
outarg.max_stack_depth);
}
if (arg->minor < 5)
outargsize = FUSE_COMPAT_INIT_OUT_SIZE;

View File

@ -194,6 +194,8 @@ FUSE_3.17 {
_fuse_new_30;
_fuse_new_317;
fuse_main_real_317;
fuse_passthrough_open;
fuse_passthrough_close;
} FUSE_3.12;
# Local Variables: