diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 4cc657d743f7..f0d2cb257bb8 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -116,7 +116,7 @@ The following services are provided: * Handle local caching, allowing cached data and server-read data to be interleaved for a single request. - * Handle clearing of bufferage that aren't on the server. + * Handle clearing of bufferage that isn't on the server. * Handle retrying of reads that failed, switching reads from the cache to the server as necessary. diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 24fdc74caeba..819c75233235 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -68,17 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; + unsigned long long pos = subreq->start + subreq->transferred; int total, err; - total = p9_client_read(fid, subreq->start + subreq->transferred, - &subreq->io_iter, &err); + total = p9_client_read(fid, pos, &subreq->io_iter, &err); /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ if (subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (pos + total >= i_size_read(rreq->inode)) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - netfs_subreq_terminated(subreq, err ?: total, false); + if (!err) + subreq->transferred += total; + + netfs_read_subreq_terminated(subreq, err, false); } /** diff --git a/fs/afs/file.c b/fs/afs/file.c index ec1be0091fdb..492d857a3fa0 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -242,9 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op) req->error = error; if (subreq) { - if (subreq->rreq->origin != NETFS_DIO_READ) - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + subreq->rreq->i_size = req->file_size; + if (req->pos + req->actual_len >= req->file_size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); + netfs_read_subreq_terminated(subreq, error, false); req->subreq = NULL; } else if (req->done) { req->done(req); @@ -262,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op) afs_fetch_data_notify(op); } +static void afs_fetch_data_aborted(struct afs_operation *op) +{ + afs_check_for_remote_deletion(op); + afs_fetch_data_notify(op); +} + static void afs_fetch_data_put(struct afs_operation *op) { op->fetch.req->error = afs_op_error(op); @@ -272,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, - .aborted = afs_check_for_remote_deletion, + .aborted = afs_fetch_data_aborted, .failed = afs_fetch_data_notify, .put = afs_fetch_data_put, }; @@ -294,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) op = afs_alloc_operation(req->key, vnode->volume); if (IS_ERR(op)) { if (req->subreq) - netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); + netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); return PTR_ERR(op); } @@ -305,14 +313,15 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) return afs_do_sync_operation(op); } -static void afs_issue_read(struct netfs_io_subrequest *subreq) +static void afs_read_worker(struct work_struct *work) { + struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work); struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct afs_read *fsreq; fsreq = afs_alloc_read(GFP_NOFS); if (!fsreq) - return netfs_subreq_terminated(subreq, -ENOMEM, false); + return netfs_read_subreq_terminated(subreq, -ENOMEM, false); fsreq->subreq = subreq; fsreq->pos = subreq->start + subreq->transferred; @@ -321,10 +330,17 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq) fsreq->vnode = vnode; fsreq->iter = &subreq->io_iter; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); } +static void afs_issue_read(struct netfs_io_subrequest *subreq) +{ + INIT_WORK(&subreq->work, afs_read_worker); + queue_work(system_long_wq, &subreq->work); +} + static int afs_symlink_read_folio(struct file *file, struct folio *folio) { struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 79cd30775b7a..098fa034a1cc 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu,%zu/%llu}", @@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, req->actual_len); ret = afs_extract_data(call, true); + if (req->subreq) { + req->subreq->transferred += count_before - call->iov_len; + netfs_read_subreq_progress(req->subreq, false); + } if (ret < 0) return ret; diff --git a/fs/afs/write.c b/fs/afs/write.c index e959640694c2..34107b55f834 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -89,10 +89,12 @@ static const struct afs_operation_ops afs_store_data_operation = { */ void afs_prepare_write(struct netfs_io_subrequest *subreq) { + struct netfs_io_stream *stream = &subreq->rreq->io_streams[subreq->stream_nr]; + //if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) // subreq->max_len = 512 * 1024; //else - subreq->max_len = 256 * 1024 * 1024; + stream->sreq_max_len = 256 * 1024 * 1024; } /* diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index f521e66d3bf6..024227aba4cd 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; + size_t count_before; int ret; _enter("{%u,%zu, %zu/%llu}", @@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) /* extract the returned data */ case 2: - _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->actual_len); + count_before = call->iov_len; + _debug("extract data %zu/%llu", count_before, req->actual_len); ret = afs_extract_data(call, true); + if (req->subreq) { + req->subreq->transferred += count_before - call->iov_len; + netfs_read_subreq_progress(req->subreq, false); + } if (ret < 0) return ret; diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index a91acd03ee12..6a821a959b59 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -627,11 +627,12 @@ static void cachefiles_prepare_write_subreq(struct netfs_io_subrequest *subreq) { struct netfs_io_request *wreq = subreq->rreq; struct netfs_cache_resources *cres = &wreq->cache_resources; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; _enter("W=%x[%x] %llx", wreq->debug_id, subreq->debug_index, subreq->start); - subreq->max_len = MAX_RW_COUNT; - subreq->max_nr_segs = BIO_MAX_VECS; + stream->sreq_max_len = MAX_RW_COUNT; + stream->sreq_max_segs = BIO_MAX_VECS; if (!cachefiles_cres_file(cres)) { if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) @@ -647,6 +648,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) struct netfs_cache_resources *cres = &wreq->cache_resources; struct cachefiles_object *object = cachefiles_cres_object(cres); struct cachefiles_cache *cache = object->volume->cache; + struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; const struct cred *saved_cred; size_t off, pre, post, len = subreq->len; loff_t start = subreq->start; @@ -660,6 +662,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) if (off) { pre = CACHEFILES_DIO_BLOCK_SIZE - off; if (pre >= len) { + fscache_count_dio_misfit(); netfs_write_subrequest_terminated(subreq, len, false); return; } @@ -670,10 +673,22 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) } /* We also need to end on the cache granularity boundary */ + if (start + len == wreq->i_size) { + size_t part = len % CACHEFILES_DIO_BLOCK_SIZE; + size_t need = CACHEFILES_DIO_BLOCK_SIZE - part; + + if (part && stream->submit_extendable_to >= need) { + len += need; + subreq->len += need; + subreq->io_iter.count += need; + } + } + post = len & (CACHEFILES_DIO_BLOCK_SIZE - 1); if (post) { len -= post; if (len == 0) { + fscache_count_dio_misfit(); netfs_write_subrequest_terminated(subreq, post, false); return; } diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 4dd8a993c60a..7c6f260a3be5 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -64,9 +64,15 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object) memcpy(buf->data, fscache_get_aux(object->cookie), len); ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, - buf, sizeof(struct cachefiles_xattr) + len, 0); + if (ret == 0) { + ret = mnt_want_write_file(file); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, buf, + sizeof(struct cachefiles_xattr) + len, 0); + mnt_drop_write_file(file); + } + } if (ret < 0) { trace_cachefiles_vfs_error(object, file_inode(file), ret, cachefiles_trace_setxattr_error); @@ -151,8 +157,14 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, int ret; ret = cachefiles_inject_remove_error(); - if (ret == 0) - ret = vfs_removexattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache); + if (ret == 0) { + ret = mnt_want_write(cache->mnt); + if (ret == 0) { + ret = vfs_removexattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache); + mnt_drop_write(cache->mnt); + } + } if (ret < 0) { trace_cachefiles_vfs_error(object, d_inode(dentry), ret, cachefiles_trace_remxattr_error); @@ -208,9 +220,15 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume) memcpy(buf->data, p, volume->vcookie->coherency_len); ret = cachefiles_inject_write_error(); - if (ret == 0) - ret = vfs_setxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, - buf, len, 0); + if (ret == 0) { + ret = mnt_want_write(volume->cache->mnt); + if (ret == 0) { + ret = vfs_setxattr(&nop_mnt_idmap, dentry, + cachefiles_xattr_cache, + buf, len, 0); + mnt_drop_write(volume->cache->mnt); + } + } if (ret < 0) { trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret, cachefiles_trace_setxattr_error); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a5f848c167fa..5d9ccda098cc 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) } } -static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) -{ - struct inode *inode = subreq->rreq->inode; - struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); - struct ceph_inode_info *ci = ceph_inode(inode); - u64 objno, objoff; - u32 xlen; - - /* Truncate the extent at the end of the current block */ - ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, - &objno, &objoff, &xlen); - subreq->len = min(xlen, fsc->mount_options->rsize); - return true; -} - static void finish_netfs_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; @@ -264,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req) calc_pages_for(osd_data->alignment, osd_data->length), false); } - netfs_subreq_terminated(subreq, err, false); + if (err > 0) { + subreq->transferred = err; + err = 0; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(subreq, err, false); iput(req->r_inode); ceph_dec_osd_stopping_blocker(fsc->mdsc); } @@ -278,7 +269,6 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); - struct iov_iter iter; ssize_t err = 0; size_t len; int mode; @@ -301,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); req->r_num_caps = 2; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) goto out; @@ -314,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) } len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); - if (err == 0) + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter); + if (err == 0) { err = -EFAULT; + } else { + subreq->transferred += err; + err = 0; + } ceph_mdsc_put_request(req); out: - netfs_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq, err, false); return true; } +static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize); + return 0; +} + static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -334,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); - struct iov_iter iter; - int err = 0; - u64 len = subreq->len; + int err; + u64 len; bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); u64 off = subreq->start; int extent_cnt; @@ -349,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; + // TODO: This rounding here is slightly dodgy. It *should* work, for + // now, as the cache only deals in blocks that are a multiple of + // PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE. What needs to + // happen is for the fscrypt driving to be moved into netfslib and the + // data in the cache also to be stored encrypted. + len = subreq->len; ceph_fscrypt_adjust_off_and_len(inode, &off, &len); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -371,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", ceph_vinop(inode), subreq->start, subreq->len, len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - /* * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for * encrypted inodes. We'd need infrastructure that handles an iov_iter @@ -384,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct page **pages; size_t page_off; - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off); if (err < 0) { doutc(cl, "%llx.%llx failed to allocate pages, %d\n", ceph_vinop(inode), err); @@ -399,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); } else { - osd_req_op_extent_osd_iter(req, 0, &iter); + osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter); } if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { err = -EIO; @@ -410,17 +423,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) req->r_inode = inode; ihold(inode); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); ceph_osdc_start_request(req->r_osdc, req); out: ceph_osdc_put_request(req); if (err) - netfs_subreq_terminated(subreq, err, false); + netfs_read_subreq_terminated(subreq, err, false); doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; @@ -472,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) priv->caps = got; rreq->netfs_priv = priv; + rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize; out: if (ret < 0) @@ -496,9 +512,9 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq) const struct netfs_request_ops ceph_netfs_ops = { .init_request = ceph_init_request, .free_request = ceph_netfs_free_request, + .prepare_read = ceph_netfs_prepare_read, .issue_read = ceph_netfs_issue_read, .expand_readahead = ceph_netfs_expand_readahead, - .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, }; diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile index 8e6781e0b10b..d08b0bfb6756 100644 --- a/fs/netfs/Makefile +++ b/fs/netfs/Makefile @@ -5,12 +5,14 @@ netfs-y := \ buffered_write.o \ direct_read.o \ direct_write.o \ - io.o \ iterator.o \ locking.o \ main.o \ misc.o \ objects.o \ + read_collect.o \ + read_pgpriv2.o \ + read_retry.o \ write_collect.o \ write_issue.o diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 27c750d39476..c40e226053cc 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,214 +9,6 @@ #include #include "internal.h" -/* - * [DEPRECATED] Unlock the folios in a read operation for when the filesystem - * is using PG_private_2 and direct writing to the cache from here rather than - * marking the page for writeback. - * - * Note that we don't touch folio->private in this code. - */ -static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, - size_t *account) -{ - struct netfs_io_subrequest *subreq; - struct folio *folio; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - /* Walk through the pagecache and the I/O request lists simultaneously. - * We may have a mixture of cached and uncached sections and we only - * really want to write out the uncached sections. This is slightly - * complicated by the possibility that we might have huge pages with a - * mixture inside. - */ - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); - - trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); - - rcu_read_lock(); - xas_for_each(&xas, folio, last_page) { - loff_t pg_end; - bool pg_failed = false; - bool folio_started = false; - - if (xas_retry(&xas, folio)) - continue; - - pg_end = folio_pos(folio) + folio_size(folio) - 1; - - for (;;) { - loff_t sreq_end; - - if (!subreq) { - pg_failed = true; - break; - } - - if (!folio_started && - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && - fscache_operation_valid(&rreq->cache_resources)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - - pg_failed |= subreq_failed; - sreq_end = subreq->start + subreq->len - 1; - if (pg_end < sreq_end) - break; - - *account += subreq->transferred; - if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - - if (pg_end == sreq_end) - break; - } - - if (!pg_failed) { - flush_dcache_folio(folio); - folio_mark_uptodate(folio); - } - - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); - else - folio_unlock(folio); - } - } - rcu_read_unlock(); -} - -/* - * Unlock the folios in a read operation. We need to set PG_writeback on any - * folios we're going to write back before we unlock them. - * - * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use - * PG_private_2 and do a direct write to the cache from here instead. - */ -void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_folio *finfo; - struct folio *folio; - pgoff_t start_page = rreq->start / PAGE_SIZE; - pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; - size_t account = 0; - bool subreq_failed = false; - - XA_STATE(xas, &rreq->mapping->i_pages, start_page); - - if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) { - __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } - } - - /* Handle deprecated PG_private_2 case. */ - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - netfs_rreq_unlock_folios_pgpriv2(rreq, &account); - goto out; - } - - /* Walk through the pagecache and the I/O request lists simultaneously. - * We may have a mixture of cached and uncached sections and we only - * really want to write out the uncached sections. This is slightly - * complicated by the possibility that we might have huge pages with a - * mixture inside. - */ - subreq = list_first_entry(&rreq->subrequests, - struct netfs_io_subrequest, rreq_link); - subreq_failed = (subreq->error < 0); - - trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); - - rcu_read_lock(); - xas_for_each(&xas, folio, last_page) { - loff_t pg_end; - bool pg_failed = false; - bool wback_to_cache = false; - - if (xas_retry(&xas, folio)) - continue; - - pg_end = folio_pos(folio) + folio_size(folio) - 1; - - for (;;) { - loff_t sreq_end; - - if (!subreq) { - pg_failed = true; - break; - } - - wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - pg_failed |= subreq_failed; - sreq_end = subreq->start + subreq->len - 1; - if (pg_end < sreq_end) - break; - - account += subreq->transferred; - if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - subreq = list_next_entry(subreq, rreq_link); - subreq_failed = (subreq->error < 0); - } else { - subreq = NULL; - subreq_failed = false; - } - - if (pg_end == sreq_end) - break; - } - - if (!pg_failed) { - flush_dcache_folio(folio); - finfo = netfs_folio_info(folio); - if (finfo) { - trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); - if (finfo->netfs_group) - folio_change_private(folio, finfo->netfs_group); - else - folio_detach_private(folio); - kfree(finfo); - } - folio_mark_uptodate(folio); - if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); - filemap_dirty_folio(folio->mapping, folio); - } - } - - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); - else - folio_unlock(folio); - } - } - rcu_read_unlock(); - -out: - task_io_account_read(account); - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); -} - static void netfs_cache_expand_readahead(struct netfs_io_request *rreq, unsigned long long *_start, unsigned long long *_len, @@ -271,6 +63,336 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); } +/* + * Decant the list of folios to read into a rolling buffer. + */ +static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq, + struct folio_queue *folioq) +{ + unsigned int order, nr; + size_t size = 0; + + nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios, + ARRAY_SIZE(folioq->vec.folios)); + folioq->vec.nr = nr; + for (int i = 0; i < nr; i++) { + struct folio *folio = folioq_folio(folioq, i); + + trace_netfs_folio(folio, netfs_folio_trace_read); + order = folio_order(folio); + folioq->orders[i] = order; + size += PAGE_SIZE << order; + } + + for (int i = nr; i < folioq_nr_slots(folioq); i++) + folioq_clear(folioq, i); + + return size; +} + +/* + * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O + * @subreq: The subrequest to be set up + * + * Prepare the I/O iterator representing the read buffer on a subrequest for + * the filesystem to use for I/O (it can be passed directly to a socket). This + * is intended to be called from the ->issue_read() method once the filesystem + * has trimmed the request to the size it wants. + * + * Returns the limited size if successful and -ENOMEM if insufficient memory + * available. + * + * [!] NOTE: This must be run in the same thread as ->issue_read() was called + * in as we access the readahead_control struct. + */ +static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + size_t rsize = subreq->len; + + if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) + rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); + + if (rreq->ractl) { + /* If we don't have sufficient folios in the rolling buffer, + * extract a folioq's worth from the readahead region at a time + * into the buffer. Note that this acquires a ref on each page + * that we will need to release later - but we don't want to do + * that until after we've started the I/O. + */ + while (rreq->submitted < subreq->start + rsize) { + struct folio_queue *tail = rreq->buffer_tail, *new; + size_t added; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(new); + new->prev = tail; + tail->next = new; + rreq->buffer_tail = new; + added = netfs_load_buffer_from_ra(rreq, new); + rreq->iter.count += added; + rreq->submitted += added; + } + } + + subreq->len = rsize; + if (unlikely(rreq->io_streams[0].sreq_max_segs)) { + size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + rreq->io_streams[0].sreq_max_segs); + + if (limit < rsize) { + subreq->len = limit; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } + + subreq->io_iter = rreq->iter; + + if (iov_iter_is_folioq(&subreq->io_iter)) { + if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) { + subreq->io_iter.folioq = subreq->io_iter.folioq->next; + subreq->io_iter.folioq_slot = 0; + } + subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq; + subreq->curr_folioq_slot = subreq->io_iter.folioq_slot; + subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; + } + + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(&rreq->iter, subreq->len); + return subreq->len; +} + +static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq, + loff_t i_size) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + if (!cres->ops) + return NETFS_DOWNLOAD_FROM_SERVER; + return cres->ops->prepare_read(subreq, i_size); +} + +static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct netfs_io_subrequest *subreq = priv; + + if (transferred_or_error < 0) { + netfs_read_subreq_terminated(subreq, transferred_or_error, was_async); + return; + } + + if (transferred_or_error > 0) + subreq->transferred += transferred_or_error; + netfs_read_subreq_terminated(subreq, 0, was_async); +} + +/* + * Issue a read against the cache. + * - Eats the caller's ref on subreq. + */ +static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + + netfs_stat(&netfs_n_rh_read); + cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_IGNORE, + netfs_cache_read_terminated, subreq); +} + +/* + * Perform a read to the pagecache from a series of sources of different types, + * slicing up the region to be read according to available cache blocks and + * network rsize. + */ +static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +{ + struct netfs_inode *ictx = netfs_inode(rreq->inode); + unsigned long long start = rreq->start; + ssize_t size = rreq->len; + int ret = 0; + + atomic_inc(&rreq->nr_outstanding); + + do { + struct netfs_io_subrequest *subreq; + enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; + ssize_t slice; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + ret = -ENOMEM; + break; + } + + subreq->start = start; + subreq->len = size; + + atomic_inc(&rreq->nr_outstanding); + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated = rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + spin_unlock_bh(&rreq->lock); + + source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); + subreq->source = source; + if (source == NETFS_DOWNLOAD_FROM_SERVER) { + unsigned long long zp = umin(ictx->zero_point, rreq->i_size); + size_t len = subreq->len; + + if (subreq->start >= zp) { + subreq->source = source = NETFS_FILL_WITH_ZEROES; + goto fill_with_zeroes; + } + + if (len > zp - subreq->start) + len = zp - subreq->start; + if (len == 0) { + pr_err("ZERO-LEN READ: R=%08x[%x] l=%zx/%zx s=%llx z=%llx i=%llx", + rreq->debug_id, subreq->debug_index, + subreq->len, size, + subreq->start, ictx->zero_point, rreq->i_size); + break; + } + subreq->len = len; + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_cancel); + break; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + } + + slice = netfs_prepare_read_iterator(subreq); + if (slice < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + ret = slice; + break; + } + + rreq->netfs_ops->issue_read(subreq); + goto done; + } + + fill_with_zeroes: + if (source == NETFS_FILL_WITH_ZEROES) { + subreq->source = NETFS_FILL_WITH_ZEROES; + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + netfs_stat(&netfs_n_rh_zero); + slice = netfs_prepare_read_iterator(subreq); + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_read_subreq_terminated(subreq, 0, false); + goto done; + } + + if (source == NETFS_READ_FROM_CACHE) { + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); + slice = netfs_prepare_read_iterator(subreq); + netfs_read_cache_to_pagecache(rreq, subreq); + goto done; + } + + pr_err("Unexpected read source %u\n", source); + WARN_ON_ONCE(1); + break; + + done: + size -= slice; + start += slice; + cond_resched(); + } while (size > 0); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); + + /* Defer error return as we may need to wait for outstanding I/O. */ + cmpxchg(&rreq->error, 0, ret); +} + +/* + * Wait for the read operation to complete, successfully or otherwise. + */ +static int netfs_wait_for_read(struct netfs_io_request *rreq) +{ + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + + return ret; +} + +/* + * Set up the initial folioq of buffer folios in the rolling buffer and set the + * iterator to refer to it. + */ +static int netfs_prime_buffer(struct netfs_io_request *rreq) +{ + struct folio_queue *folioq; + size_t added; + + folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + if (!folioq) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(folioq); + rreq->buffer = folioq; + rreq->buffer_tail = folioq; + rreq->submitted = rreq->start; + iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0); + + added = netfs_load_buffer_from_ra(rreq, folioq); + rreq->iter.count += added; + rreq->submitted += added; + return 0; +} + +/* + * Drop the ref on each folio that we inherited from the VM readahead code. We + * still have the folio locks to pin the page until we complete the I/O. + * + * Note that we can't just release the batch in each queue struct as we use the + * occupancy count in other places. + */ +static void netfs_put_ra_refs(struct folio_queue *folioq) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (folioq) { + for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) { + struct folio *folio = folioq_folio(folioq, slot); + if (!folio) + continue; + trace_netfs_folio(folio, netfs_folio_trace_read_put); + if (!folio_batch_add(&fbatch, folio)) + folio_batch_release(&fbatch); + } + folioq = folioq->next; + } + + folio_batch_release(&fbatch); +} + /** * netfs_readahead - Helper to manage a read request * @ractl: The description of the readahead request @@ -289,22 +411,17 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in void netfs_readahead(struct readahead_control *ractl) { struct netfs_io_request *rreq; - struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); + struct netfs_inode *ictx = netfs_inode(ractl->mapping->host); + unsigned long long start = readahead_pos(ractl); + size_t size = readahead_length(ractl); int ret; - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); - - if (readahead_count(ractl) == 0) - return; - - rreq = netfs_alloc_request(ractl->mapping, ractl->file, - readahead_pos(ractl), - readahead_length(ractl), + rreq = netfs_alloc_request(ractl->mapping, ractl->file, start, size, NETFS_READAHEAD); if (IS_ERR(rreq)) return; - ret = netfs_begin_cache_read(rreq, ctx); + ret = netfs_begin_cache_read(rreq, ictx); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) goto cleanup_free; @@ -314,18 +431,15 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); - /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages, - rreq->start, rreq->len); + rreq->ractl = ractl; + if (netfs_prime_buffer(rreq) < 0) + goto cleanup_free; + netfs_read_to_pagecache(rreq); - /* Drop the refs on the folios here rather than in the cache or - * filesystem. The locks will be dropped in netfs_rreq_unlock(). - */ - while (readahead_folio(ractl)) - ; + /* Release the folio refs whilst we're waiting for the I/O. */ + netfs_put_ra_refs(rreq->buffer); - netfs_begin_read(rreq, false); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, true, netfs_rreq_trace_put_return); return; cleanup_free: @@ -334,6 +448,117 @@ cleanup_free: } EXPORT_SYMBOL(netfs_readahead); +/* + * Create a rolling buffer with a single occupying folio. + */ +static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) +{ + struct folio_queue *folioq; + + folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); + if (!folioq) + return -ENOMEM; + + netfs_stat(&netfs_n_folioq); + folioq_init(folioq); + folioq_append(folioq, folio); + BUG_ON(folioq_folio(folioq, 0) != folio); + BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio)); + rreq->buffer = folioq; + rreq->buffer_tail = folioq; + rreq->submitted = rreq->start + rreq->len; + iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len); + rreq->ractl = (struct readahead_control *)1UL; + return 0; +} + +/* + * Read into gaps in a folio partially filled by a streaming write. + */ +static int netfs_read_gaps(struct file *file, struct folio *folio) +{ + struct netfs_io_request *rreq; + struct address_space *mapping = folio->mapping; + struct netfs_folio *finfo = netfs_folio_info(folio); + struct netfs_inode *ctx = netfs_inode(mapping->host); + struct folio *sink = NULL; + struct bio_vec *bvec; + unsigned int from = finfo->dirty_offset; + unsigned int to = from + finfo->dirty_len; + unsigned int off = 0, i = 0; + size_t flen = folio_size(folio); + size_t nr_bvec = flen / PAGE_SIZE + 2; + size_t part; + int ret; + + _enter("%lx", folio->index); + + rreq = netfs_alloc_request(mapping, file, folio_pos(folio), flen, NETFS_READ_GAPS); + if (IS_ERR(rreq)) { + ret = PTR_ERR(rreq); + goto alloc_error; + } + + ret = netfs_begin_cache_read(rreq, ctx); + if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) + goto discard; + + netfs_stat(&netfs_n_rh_read_folio); + trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_read_gaps); + + /* Fiddle the buffer so that a gap at the beginning and/or a gap at the + * end get copied to, but the middle is discarded. + */ + ret = -ENOMEM; + bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + goto discard; + + sink = folio_alloc(GFP_KERNEL, 0); + if (!sink) { + kfree(bvec); + goto discard; + } + + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + + rreq->direct_bv = bvec; + rreq->direct_bv_count = nr_bvec; + if (from > 0) { + bvec_set_folio(&bvec[i++], folio, from, 0); + off = from; + } + while (off < to) { + part = min_t(size_t, to - off, PAGE_SIZE); + bvec_set_folio(&bvec[i++], sink, part, 0); + off += part; + } + if (to < flen) + bvec_set_folio(&bvec[i++], folio, flen - to, to); + iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); + rreq->submitted = rreq->start + flen; + + netfs_read_to_pagecache(rreq); + + if (sink) + folio_put(sink); + + ret = netfs_wait_for_read(rreq); + if (ret == 0) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + folio_unlock(folio); + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + return ret < 0 ? ret : 0; + +discard: + netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); +alloc_error: + folio_unlock(folio); + return ret; +} + /** * netfs_read_folio - Helper to manage a read_folio request * @file: The file to read from @@ -353,9 +578,13 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct address_space *mapping = folio->mapping; struct netfs_io_request *rreq; struct netfs_inode *ctx = netfs_inode(mapping->host); - struct folio *sink = NULL; int ret; + if (folio_test_dirty(folio)) { + trace_netfs_folio(folio, netfs_folio_trace_read_gaps); + return netfs_read_gaps(file, folio); + } + _enter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, @@ -374,54 +603,12 @@ int netfs_read_folio(struct file *file, struct folio *folio) trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); /* Set up the output buffer */ - if (folio_test_dirty(folio)) { - /* Handle someone trying to read from an unflushed streaming - * write. We fiddle the buffer so that a gap at the beginning - * and/or a gap at the end get copied to, but the middle is - * discarded. - */ - struct netfs_folio *finfo = netfs_folio_info(folio); - struct bio_vec *bvec; - unsigned int from = finfo->dirty_offset; - unsigned int to = from + finfo->dirty_len; - unsigned int off = 0, i = 0; - size_t flen = folio_size(folio); - size_t nr_bvec = flen / PAGE_SIZE + 2; - size_t part; + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto discard; - ret = -ENOMEM; - bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL); - if (!bvec) - goto discard; - - sink = folio_alloc(GFP_KERNEL, 0); - if (!sink) - goto discard; - - trace_netfs_folio(folio, netfs_folio_trace_read_gaps); - - rreq->direct_bv = bvec; - rreq->direct_bv_count = nr_bvec; - if (from > 0) { - bvec_set_folio(&bvec[i++], folio, from, 0); - off = from; - } - while (off < to) { - part = min_t(size_t, to - off, PAGE_SIZE); - bvec_set_folio(&bvec[i++], sink, part, 0); - off += part; - } - if (to < flen) - bvec_set_folio(&bvec[i++], folio, flen - to, to); - iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); - } else { - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); - } - - ret = netfs_begin_read(rreq, true); - if (sink) - folio_put(sink); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; @@ -494,13 +681,10 @@ zero_out: * * Pre-read data for a write-begin request by drawing data from the cache if * possible, or the netfs if not. Space beyond the EOF is zero-filled. - * Multiple I/O requests from different sources will get munged together. If - * necessary, the readahead window can be expanded in either direction to a - * more convenient alighment for RPC efficiency or to make storage in the cache - * feasible. + * Multiple I/O requests from different sources will get munged together. * * The calling netfs must provide a table of operations, only one of which, - * issue_op, is mandatory. + * issue_read, is mandatory. * * The check_write_begin() operation can be provided to check for and flush * conflicting writes once the folio is grabbed and locked. It is passed a @@ -528,8 +712,6 @@ int netfs_write_begin(struct netfs_inode *ctx, pgoff_t index = pos >> PAGE_SHIFT; int ret; - DEFINE_READAHEAD(ractl, file, NULL, mapping, index); - retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); @@ -577,22 +759,13 @@ retry: netfs_stat(&netfs_n_rh_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); - /* Expand the request to meet caching requirements and download - * preferences. - */ - ractl._nr_pages = folio_nr_pages(folio); - netfs_rreq_expand(rreq, &ractl); - /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto error_put; - /* We hold the folio locks, so we can drop the references */ - folio_get(folio); - while (readahead_folio(&ractl)) - ; - - ret = netfs_begin_read(rreq, true); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); if (ret < 0) goto error; netfs_put_request(rreq, false, netfs_rreq_trace_put_return); @@ -652,10 +825,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); /* Set up the output buffer */ - iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages, - rreq->start, rreq->len); + ret = netfs_create_singular_buffer(rreq, folio); + if (ret < 0) + goto error_put; - ret = netfs_begin_read(rreq, true); + folioq_mark2(rreq->buffer, 0); + netfs_read_to_pagecache(rreq); + ret = netfs_wait_for_read(rreq); netfs_put_request(rreq, false, netfs_rreq_trace_put_return); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index ca53c5d1622e..d7eae597e54d 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -13,91 +13,22 @@ #include #include "internal.h" -/* - * Determined write method. Adjust netfs_folio_traces if this is changed. - */ -enum netfs_how_to_modify { - NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */ - NETFS_JUST_PREFETCH, /* We have to read the folio anyway */ - NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */ - NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */ - NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */ - NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */ - NETFS_FLUSH_CONTENT, /* Flush incompatible content. */ -}; +static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) +{ + if (netfs_group) + folio_attach_private(folio, netfs_get_group(netfs_group)); +} static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group) { void *priv = folio_get_private(folio); - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) - folio_attach_private(folio, netfs_get_group(netfs_group)); - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) - folio_detach_private(folio); -} - -/* - * Decide how we should modify a folio. We might be attempting to do - * write-streaming, in which case we don't want to a local RMW cycle if we can - * avoid it. If we're doing local caching or content crypto, we award that - * priority over avoiding RMW. If the file is open readably, then we also - * assume that we may want to read what we wrote. - */ -static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, - struct file *file, - struct folio *folio, - void *netfs_group, - size_t flen, - size_t offset, - size_t len, - bool maybe_trouble) -{ - struct netfs_folio *finfo = netfs_folio_info(folio); - struct netfs_group *group = netfs_folio_group(folio); - loff_t pos = folio_pos(folio); - - _enter(""); - - if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) - return NETFS_FLUSH_CONTENT; - - if (folio_test_uptodate(folio)) - return NETFS_FOLIO_IS_UPTODATE; - - if (pos >= ctx->zero_point) - return NETFS_MODIFY_AND_CLEAR; - - if (!maybe_trouble && offset == 0 && len >= flen) - return NETFS_WHOLE_FOLIO_MODIFY; - - if (file->f_mode & FMODE_READ) - goto no_write_streaming; - - if (netfs_is_cache_enabled(ctx)) { - /* We don't want to get a streaming write on a file that loses - * caching service temporarily because the backing store got - * culled. - */ - goto no_write_streaming; + if (unlikely(priv != netfs_group)) { + if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE)) + folio_attach_private(folio, netfs_get_group(netfs_group)); + else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE) + folio_detach_private(folio); } - - if (!finfo) - return NETFS_STREAMING_WRITE; - - /* We can continue a streaming write only if it continues on from the - * previous. If it overlaps, we must flush lest we suffer a partial - * copy and disjoint dirty regions. - */ - if (offset == finfo->dirty_offset + finfo->dirty_len) - return NETFS_STREAMING_WRITE_CONT; - return NETFS_FLUSH_CONTENT; - -no_write_streaming: - if (finfo) { - netfs_stat(&netfs_n_wh_wstream_conflict); - return NETFS_FLUSH_CONTENT; - } - return NETFS_JUST_PREFETCH; } /* @@ -177,13 +108,10 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, .range_end = iocb->ki_pos + iter->count, }; struct netfs_io_request *wreq = NULL; - struct netfs_folio *finfo; - struct folio *folio, *writethrough = NULL; - enum netfs_how_to_modify howto; - enum netfs_folio_trace trace; + struct folio *folio = NULL, *writethrough = NULL; unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; - loff_t i_size, pos = iocb->ki_pos, from, to; + loff_t i_size, pos = iocb->ki_pos; size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; @@ -213,15 +141,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } do { + struct netfs_folio *finfo; + struct netfs_group *group; + unsigned long long fpos; size_t flen; size_t offset; /* Offset into pagecache folio */ size_t part; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); - if (unlikely(ret < 0)) - break; - offset = pos & (max_chunk - 1); part = min(max_chunk - offset, iov_iter_count(iter)); @@ -247,7 +174,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, } flen = folio_size(folio); - offset = pos & (flen - 1); + fpos = folio_pos(folio); + offset = pos - fpos; part = min_t(size_t, flen - offset, part); /* Wait for writeback to complete. The writeback engine owns @@ -265,71 +193,52 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, goto error_folio_unlock; } - /* See if we need to prefetch the area we're going to modify. - * We need to do this before we get a lock on the folio in case - * there's more than one writer competing for the same cache - * block. + /* Decide how we should modify a folio. We might be attempting + * to do write-streaming, in which case we don't want to a + * local RMW cycle if we can avoid it. If we're doing local + * caching or content crypto, we award that priority over + * avoiding RMW. If the file is open readably, then we also + * assume that we may want to read what we wrote. */ - howto = netfs_how_to_modify(ctx, file, folio, netfs_group, - flen, offset, part, maybe_trouble); - _debug("howto %u", howto); - switch (howto) { - case NETFS_JUST_PREFETCH: - ret = netfs_prefetch_for_write(file, folio, offset, part); - if (ret < 0) { - _debug("prefetch = %zd", ret); - goto error_folio_unlock; - } - break; - case NETFS_FOLIO_IS_UPTODATE: - case NETFS_WHOLE_FOLIO_MODIFY: - case NETFS_STREAMING_WRITE_CONT: - break; - case NETFS_MODIFY_AND_CLEAR: + finfo = netfs_folio_info(folio); + group = netfs_folio_group(folio); + + if (unlikely(group != netfs_group) && + group != NETFS_FOLIO_COPY_TO_CACHE) + goto flush_content; + + if (folio_test_uptodate(folio)) { + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; + netfs_set_group(folio, netfs_group); + trace_netfs_folio(folio, netfs_folio_is_uptodate); + goto copied; + } + + /* If the page is above the zero-point then we assume that the + * server would just return a block of zeros or a short read if + * we try to read it. + */ + if (fpos >= ctx->zero_point) { zero_user_segment(&folio->page, 0, offset); - break; - case NETFS_STREAMING_WRITE: - ret = -EIO; - if (WARN_ON(folio_get_private(folio))) - goto error_folio_unlock; - break; - case NETFS_FLUSH_CONTENT: - trace_netfs_folio(folio, netfs_flush_content); - from = folio_pos(folio); - to = from + folio_size(folio) - 1; - folio_unlock(folio); - folio_put(folio); - ret = filemap_write_and_wait_range(mapping, from, to); - if (ret < 0) - goto error_folio_unlock; - continue; - } - - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); - - copied = copy_folio_from_iter_atomic(folio, offset, part, iter); - - flush_dcache_folio(folio); - - /* Deal with a (partially) failed copy */ - if (copied == 0) { - ret = -EFAULT; - goto error_folio_unlock; - } - - trace = (enum netfs_folio_trace)howto; - switch (howto) { - case NETFS_FOLIO_IS_UPTODATE: - case NETFS_JUST_PREFETCH: - netfs_set_group(folio, netfs_group); - break; - case NETFS_MODIFY_AND_CLEAR: + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; zero_user_segment(&folio->page, offset + copied, flen); - netfs_set_group(folio, netfs_group); + __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); - break; - case NETFS_WHOLE_FOLIO_MODIFY: + trace_netfs_folio(folio, netfs_modify_and_clear); + goto copied; + } + + /* See if we can write a whole folio in one go. */ + if (!maybe_trouble && offset == 0 && part >= flen) { + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; if (unlikely(copied < part)) { maybe_trouble = true; iov_iter_revert(iter, copied); @@ -337,16 +246,53 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_unlock(folio); goto retry; } - netfs_set_group(folio, netfs_group); + __netfs_set_group(folio, netfs_group); folio_mark_uptodate(folio); - break; - case NETFS_STREAMING_WRITE: - if (offset == 0 && copied == flen) { - netfs_set_group(folio, netfs_group); - folio_mark_uptodate(folio); - trace = netfs_streaming_filled_page; - break; + trace_netfs_folio(folio, netfs_whole_folio_modify); + goto copied; + } + + /* We don't want to do a streaming write on a file that loses + * caching service temporarily because the backing store got + * culled and we don't really want to get a streaming write on + * a file that's open for reading as ->read_folio() then has to + * be able to flush it. + */ + if ((file->f_mode & FMODE_READ) || + netfs_is_cache_enabled(ctx)) { + if (finfo) { + netfs_stat(&netfs_n_wh_wstream_conflict); + goto flush_content; } + ret = netfs_prefetch_for_write(file, folio, offset, part); + if (ret < 0) { + _debug("prefetch = %zd", ret); + goto error_folio_unlock; + } + /* Note that copy-to-cache may have been set. */ + + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; + netfs_set_group(folio, netfs_group); + trace_netfs_folio(folio, netfs_just_prefetch); + goto copied; + } + + if (!finfo) { + ret = -EIO; + if (WARN_ON(folio_get_private(folio))) + goto error_folio_unlock; + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; + if (offset == 0 && copied == flen) { + __netfs_set_group(folio, netfs_group); + folio_mark_uptodate(folio); + trace_netfs_folio(folio, netfs_streaming_filled_page); + goto copied; + } + finfo = kzalloc(sizeof(*finfo), GFP_KERNEL); if (!finfo) { iov_iter_revert(iter, copied); @@ -358,9 +304,18 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, finfo->dirty_len = copied; folio_attach_private(folio, (void *)((unsigned long)finfo | NETFS_FOLIO_INFO)); - break; - case NETFS_STREAMING_WRITE_CONT: - finfo = netfs_folio_info(folio); + trace_netfs_folio(folio, netfs_streaming_write); + goto copied; + } + + /* We can continue a streaming write only if it continues on + * from the previous. If it overlaps, we must flush lest we + * suffer a partial copy and disjoint dirty regions. + */ + if (offset == finfo->dirty_offset + finfo->dirty_len) { + copied = copy_folio_from_iter_atomic(folio, offset, part, iter); + if (unlikely(copied == 0)) + goto copy_failed; finfo->dirty_len += copied; if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) { if (finfo->netfs_group) @@ -369,17 +324,25 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_detach_private(folio); folio_mark_uptodate(folio); kfree(finfo); - trace = netfs_streaming_cont_filled_page; + trace_netfs_folio(folio, netfs_streaming_cont_filled_page); + } else { + trace_netfs_folio(folio, netfs_streaming_write_cont); } - break; - default: - WARN(true, "Unexpected modify type %u ix=%lx\n", - howto, folio->index); - ret = -EIO; - goto error_folio_unlock; + goto copied; } - trace_netfs_folio(folio, trace); + /* Incompatible write; flush the folio and try again. */ + flush_content: + trace_netfs_folio(folio, netfs_flush_content); + folio_unlock(folio); + folio_put(folio); + ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1); + if (ret < 0) + goto error_folio_unlock; + continue; + + copied: + flush_dcache_folio(folio); /* Update the inode size if we moved the EOF marker */ pos += copied; @@ -401,12 +364,22 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, folio_put(folio); folio = NULL; + ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); + if (unlikely(ret < 0)) + break; + cond_resched(); } while (iov_iter_count(iter)); out: - if (likely(written) && ctx->ops->post_modify) - ctx->ops->post_modify(inode); + if (likely(written)) { + /* Set indication that ctime and mtime got updated in case + * close is deferred. + */ + set_bit(NETFS_ICTX_MODIFIED_ATTR, &ctx->flags); + if (unlikely(ctx->ops->post_modify)) + ctx->ops->post_modify(inode); + } if (unlikely(wreq)) { ret2 = netfs_end_writethrough(wreq, &wbc, writethrough); @@ -421,6 +394,8 @@ out: _leave(" = %zd [%zd]", written, ret); return written ? written : ret; +copy_failed: + ret = -EFAULT; error_folio_unlock: folio_unlock(folio); folio_put(folio); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 10a1e4da6bda..b1a66a6e6bc2 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -16,6 +16,143 @@ #include #include "internal.h" +static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + size_t rsize; + + rsize = umin(subreq->len, rreq->io_streams[0].sreq_max_len); + subreq->len = rsize; + + if (unlikely(rreq->io_streams[0].sreq_max_segs)) { + size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, + rreq->io_streams[0].sreq_max_segs); + + if (limit < rsize) { + subreq->len = limit; + trace_netfs_sreq(subreq, netfs_sreq_trace_limited); + } + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + + subreq->io_iter = rreq->iter; + iov_iter_truncate(&subreq->io_iter, subreq->len); + iov_iter_advance(&rreq->iter, subreq->len); +} + +/* + * Perform a read to a buffer from the server, slicing up the region to be read + * according to the network rsize. + */ +static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) +{ + unsigned long long start = rreq->start; + ssize_t size = rreq->len; + int ret = 0; + + atomic_set(&rreq->nr_outstanding, 1); + + do { + struct netfs_io_subrequest *subreq; + ssize_t slice; + + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) { + ret = -ENOMEM; + break; + } + + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + subreq->len = size; + + atomic_inc(&rreq->nr_outstanding); + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated = rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_added); + spin_unlock_bh(&rreq->lock); + + netfs_stat(&netfs_n_rh_download); + if (rreq->netfs_ops->prepare_read) { + ret = rreq->netfs_ops->prepare_read(subreq); + if (ret < 0) { + atomic_dec(&rreq->nr_outstanding); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + break; + } + } + + netfs_prepare_dio_read_iterator(subreq); + slice = subreq->len; + rreq->netfs_ops->issue_read(subreq); + + size -= slice; + start += slice; + rreq->submitted += slice; + + if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && + test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) + break; + cond_resched(); + } while (size > 0); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); + return ret; +} + +/* + * Perform a read to an application buffer, bypassing the pagecache and the + * local disk cache. + */ +static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) +{ + int ret; + + _enter("R=%x %llx-%llx", + rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); + + if (rreq->len == 0) { + pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); + return -EIO; + } + + // TODO: Use bounce buffer if requested + + inode_dio_begin(rreq->inode); + + ret = netfs_dispatch_unbuffered_reads(rreq); + + if (!rreq->submitted) { + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + inode_dio_end(rreq->inode); + ret = 0; + goto out; + } + + if (sync) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); + wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, + TASK_UNINTERRUPTIBLE); + + ret = rreq->error; + if (ret == 0 && rreq->submitted < rreq->len && + rreq->origin != NETFS_DIO_READ) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + } else { + ret = -EIOCBQUEUED; + } + +out: + _leave(" = %d", ret); + return ret; +} + /** * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read * @iocb: The I/O control descriptor describing the read @@ -31,7 +168,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i struct netfs_io_request *rreq; ssize_t ret; size_t orig_count = iov_iter_count(iter); - bool async = !is_sync_kiocb(iocb); + bool sync = is_sync_kiocb(iocb); _enter(""); @@ -78,13 +215,13 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i // TODO: Set up bounce buffer if needed - if (async) + if (!sync) rreq->iocb = iocb; - ret = netfs_begin_read(rreq, is_sync_kiocb(iocb)); + ret = netfs_unbuffered_read(rreq, sync); if (ret < 0) goto out; /* May be -EIOCBQUEUED */ - if (!async) { + if (sync) { // TODO: Copy from bounce buffer iocb->ki_pos += rreq->transferred; ret = rreq->transferred; @@ -94,8 +231,6 @@ out: netfs_put_request(rreq, false, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; - if (ret != -EIOCBQUEUED) - iov_iter_revert(iter, orig_count - iov_iter_count(iter)); return ret; } EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 7773f3d855a9..c9f0ed24cb7b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -22,15 +23,9 @@ /* * buffered_read.c */ -void netfs_rreq_unlock_folios(struct netfs_io_request *rreq); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); -/* - * io.c - */ -int netfs_begin_read(struct netfs_io_request *rreq, bool sync); - /* * main.c */ @@ -63,6 +58,11 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} /* * misc.c */ +int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, + bool needs_put); +struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq); +void netfs_clear_buffer(struct netfs_io_request *rreq); +void netfs_reset_iter(struct netfs_io_subrequest *subreq); /* * objects.c @@ -83,6 +83,28 @@ static inline void netfs_see_request(struct netfs_io_request *rreq, trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); } +/* + * read_collect.c + */ +void netfs_read_termination_worker(struct work_struct *work); +void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); + +/* + * read_pgpriv2.c + */ +void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot); +void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq); + +/* + * read_retry.c + */ +void netfs_retry_reads(struct netfs_io_request *rreq); +void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq); + /* * stats.c */ @@ -110,6 +132,7 @@ extern atomic_t netfs_n_wh_buffered_write; extern atomic_t netfs_n_wh_writethrough; extern atomic_t netfs_n_wh_dio_write; extern atomic_t netfs_n_wh_writepages; +extern atomic_t netfs_n_wh_copy_to_cache; extern atomic_t netfs_n_wh_wstream_conflict; extern atomic_t netfs_n_wh_upload; extern atomic_t netfs_n_wh_upload_done; @@ -117,6 +140,9 @@ extern atomic_t netfs_n_wh_upload_failed; extern atomic_t netfs_n_wh_write; extern atomic_t netfs_n_wh_write_done; extern atomic_t netfs_n_wh_write_failed; +extern atomic_t netfs_n_wb_lock_skip; +extern atomic_t netfs_n_wb_lock_wait; +extern atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v); @@ -150,7 +176,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, loff_t start, enum netfs_io_origin origin); void netfs_reissue_write(struct netfs_io_stream *stream, - struct netfs_io_subrequest *subreq); + struct netfs_io_subrequest *subreq, + struct iov_iter *source); +void netfs_issue_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream); int netfs_advance_write(struct netfs_io_request *wreq, struct netfs_io_stream *stream, loff_t start, size_t len, bool to_eof); diff --git a/fs/netfs/io.c b/fs/netfs/io.c deleted file mode 100644 index d6ada4eba744..000000000000 --- a/fs/netfs/io.c +++ /dev/null @@ -1,804 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Network filesystem high-level read support. - * - * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "internal.h" - -/* - * Clear the unread part of an I/O request. - */ -static void netfs_clear_unread(struct netfs_io_subrequest *subreq) -{ - iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); -} - -static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_subrequest *subreq = priv; - - netfs_subreq_terminated(subreq, transferred_or_error, was_async); -} - -/* - * Issue a read against the cache. - * - Eats the caller's ref on subreq. - */ -static void netfs_read_from_cache(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq, - enum netfs_read_from_hole read_hole) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - - netfs_stat(&netfs_n_rh_read); - cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole, - netfs_cache_read_terminated, subreq); -} - -/* - * Fill a subrequest region with zeroes. - */ -static void netfs_fill_with_zeroes(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_zero); - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); - netfs_subreq_terminated(subreq, 0, false); -} - -/* - * Ask the netfs to issue a read request to the server for us. - * - * The netfs is expected to read from subreq->pos + subreq->transferred to - * subreq->pos + subreq->len - 1. It may not backtrack and write data into the - * buffer prior to the transferred point as it might clobber dirty data - * obtained from the cache. - * - * Alternatively, the netfs is allowed to indicate one of two things: - * - * - NETFS_SREQ_SHORT_READ: A short read - it will get called again to try and - * make progress. - * - * - NETFS_SREQ_CLEAR_TAIL: A short read - the rest of the buffer will be - * cleared. - */ -static void netfs_read_from_server(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - netfs_stat(&netfs_n_rh_download); - - if (rreq->origin != NETFS_DIO_READ && - iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred) - pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n", - rreq->debug_id, subreq->debug_index, - iov_iter_count(&subreq->io_iter), subreq->len, - subreq->transferred, subreq->flags); - rreq->netfs_ops->issue_read(subreq); -} - -/* - * Release those waiting. - */ -static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_clear_subrequests(rreq, was_async); - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); -} - -/* - * [DEPRECATED] Deal with the completion of writing the data to the cache. We - * have to clear the PG_fscache bits on the folios involved and release the - * caller's ref. - * - * May be called in softirq mode and we inherit a ref from the caller. - */ -static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, - bool was_async) -{ - struct netfs_io_subrequest *subreq; - struct folio *folio; - pgoff_t unlocked = 0; - bool have_unlocked = false; - - rcu_read_lock(); - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); - - xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { - if (xas_retry(&xas, folio)) - continue; - - /* We might have multiple writes from the same huge - * folio, but we mustn't unlock a folio more than once. - */ - if (have_unlocked && folio->index <= unlocked) - continue; - unlocked = folio_next_index(folio) - 1; - trace_netfs_folio(folio, netfs_folio_trace_end_copy); - folio_end_private_2(folio); - have_unlocked = true; - } - } - - rcu_read_unlock(); - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, - bool was_async) /* [DEPRECATED] */ -{ - struct netfs_io_subrequest *subreq = priv; - struct netfs_io_request *rreq = subreq->rreq; - - if (IS_ERR_VALUE(transferred_or_error)) { - netfs_stat(&netfs_n_rh_write_failed); - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_copy_to_cache); - } else { - netfs_stat(&netfs_n_rh_write_done); - } - - trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); - - /* If we decrement nr_copy_ops to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_copy_ops)) - netfs_rreq_unmark_after_write(rreq, was_async); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); -} - -/* - * [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref - * from the caller. - */ -static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) -{ - struct netfs_cache_resources *cres = &rreq->cache_resources; - struct netfs_io_subrequest *subreq, *next, *p; - struct iov_iter iter; - int ret; - - trace_netfs_rreq(rreq, netfs_rreq_trace_copy); - - /* We don't want terminating writes trying to wake us up whilst we're - * still going through the list. - */ - atomic_inc(&rreq->nr_copy_ops); - - list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { - list_del_init(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, - netfs_sreq_trace_put_no_copy); - } - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - /* Amalgamate adjacent writes */ - while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { - next = list_next_entry(subreq, rreq_link); - if (next->start != subreq->start + subreq->len) - break; - subreq->len += next->len; - list_del_init(&next->rreq_link); - netfs_put_subrequest(next, false, - netfs_sreq_trace_put_merged); - } - - ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, - subreq->len, rreq->i_size, true); - if (ret < 0) { - trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); - trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); - continue; - } - - iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, - subreq->start, subreq->len); - - atomic_inc(&rreq->nr_copy_ops); - netfs_stat(&netfs_n_rh_write); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); - trace_netfs_sreq(subreq, netfs_sreq_trace_write); - cres->ops->write(cres, subreq->start, &iter, - netfs_rreq_copy_terminated, subreq); - } - - /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_copy_ops)) - netfs_rreq_unmark_after_write(rreq, false); -} - -static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ -{ - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); - - netfs_rreq_do_write_to_cache(rreq); -} - -static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ -{ - rreq->work.func = netfs_rreq_write_to_cache_work; - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); -} - -/* - * Handle a short read. - */ -static void netfs_rreq_short_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - __clear_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); - __set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags); - - netfs_stat(&netfs_n_rh_short_read); - trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short); - - netfs_get_subrequest(subreq, netfs_sreq_trace_get_short_read); - atomic_inc(&rreq->nr_outstanding); - if (subreq->source == NETFS_READ_FROM_CACHE) - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_CLEAR); - else - netfs_read_from_server(rreq, subreq); -} - -/* - * Reset the subrequest iterator prior to resubmission. - */ -static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq) -{ - size_t remaining = subreq->len - subreq->transferred; - size_t count = iov_iter_count(&subreq->io_iter); - - if (count == remaining) - return; - - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x", - rreq->debug_id, subreq->debug_index, - iov_iter_count(&subreq->io_iter), subreq->transferred, - subreq->len, rreq->i_size, - subreq->io_iter.iter_type); - - if (count < remaining) - iov_iter_revert(&subreq->io_iter, remaining - count); - else - iov_iter_advance(&subreq->io_iter, count - remaining); -} - -/* - * Resubmit any short or failed operations. Returns true if we got the rreq - * ref back. - */ -static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - WARN_ON(in_interrupt()); - - trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); - - /* We don't want terminating submissions trying to wake us up whilst - * we're still going through the list. - */ - atomic_inc(&rreq->nr_outstanding); - - __clear_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error) { - if (subreq->source != NETFS_READ_FROM_CACHE) - break; - subreq->source = NETFS_DOWNLOAD_FROM_SERVER; - subreq->error = 0; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); - netfs_stat(&netfs_n_rh_download_instead); - trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - atomic_inc(&rreq->nr_outstanding); - netfs_reset_subreq_iter(rreq, subreq); - netfs_read_from_server(rreq, subreq); - } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); - netfs_reset_subreq_iter(rreq, subreq); - netfs_rreq_short_read(rreq, subreq); - } - } - - /* If we decrement nr_outstanding to 0, the usage ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_outstanding)) - return true; - - wake_up_var(&rreq->nr_outstanding); - return false; -} - -/* - * Check to see if the data read is still valid. - */ -static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - - if (!rreq->netfs_ops->is_still_valid || - rreq->netfs_ops->is_still_valid(rreq)) - return; - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->source == NETFS_READ_FROM_CACHE) { - subreq->error = -ESTALE; - __set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } - } -} - -/* - * Determine how much we can admit to having read from a DIO read. - */ -static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - unsigned int i; - size_t transferred = 0; - - for (i = 0; i < rreq->direct_bv_count; i++) { - flush_dcache_page(rreq->direct_bv[i].bv_page); - // TODO: cifs marks pages in the destination buffer - // dirty under some circumstances after a read. Do we - // need to do that too? - set_page_dirty(rreq->direct_bv[i].bv_page); - } - - list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { - if (subreq->error || subreq->transferred == 0) - break; - transferred += subreq->transferred; - if (subreq->transferred < subreq->len || - test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) - break; - } - - for (i = 0; i < rreq->direct_bv_count; i++) - flush_dcache_page(rreq->direct_bv[i].bv_page); - - rreq->transferred = transferred; - task_io_account_read(transferred); - - if (rreq->iocb) { - rreq->iocb->ki_pos += transferred; - if (rreq->iocb->ki_complete) - rreq->iocb->ki_complete( - rreq->iocb, rreq->error ? rreq->error : transferred); - } - if (rreq->netfs_ops->done) - rreq->netfs_ops->done(rreq); - inode_dio_end(rreq->inode); -} - -/* - * Assess the state of a read request and decide what to do next. - * - * Note that we could be in an ordinary kernel thread, on a workqueue or in - * softirq context at this point. We inherit a ref from the caller. - */ -static void netfs_rreq_assess(struct netfs_io_request *rreq, bool was_async) -{ - trace_netfs_rreq(rreq, netfs_rreq_trace_assess); - -again: - netfs_rreq_is_still_valid(rreq); - - if (!test_bit(NETFS_RREQ_FAILED, &rreq->flags) && - test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags)) { - if (netfs_rreq_perform_resubmissions(rreq)) - goto again; - return; - } - - if (rreq->origin != NETFS_DIO_READ) - netfs_rreq_unlock_folios(rreq); - else - netfs_rreq_assess_dio(rreq); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); - - if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && - test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) - return netfs_rreq_write_to_cache(rreq); - - netfs_rreq_completed(rreq, was_async); -} - -static void netfs_rreq_work(struct work_struct *work) -{ - struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); - netfs_rreq_assess(rreq, false); -} - -/* - * Handle the completion of all outstanding I/O operations on a read request. - * We inherit a ref from the caller. - */ -static void netfs_rreq_terminated(struct netfs_io_request *rreq, - bool was_async) -{ - if (test_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags) && - was_async) { - if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); - } else { - netfs_rreq_assess(rreq, was_async); - } -} - -/** - * netfs_subreq_terminated - Note the termination of an I/O operation. - * @subreq: The I/O request that has terminated. - * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous - * - * This tells the read helper that a contributory I/O operation has terminated, - * one way or another, and that it should integrate the results. - * - * The caller indicates in @transferred_or_error the outcome of the operation, - * supplying a positive value to indicate the number of bytes transferred, 0 to - * indicate a failure to transfer anything that should be retried or a negative - * error code. The helper will look after reissuing I/O operations as - * appropriate and writing downloaded data to the cache. - * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. - */ -void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, - ssize_t transferred_or_error, - bool was_async) -{ - struct netfs_io_request *rreq = subreq->rreq; - int u; - - _enter("R=%x[%x]{%llx,%lx},%zd", - rreq->debug_id, subreq->debug_index, - subreq->start, subreq->flags, transferred_or_error); - - switch (subreq->source) { - case NETFS_READ_FROM_CACHE: - netfs_stat(&netfs_n_rh_read_done); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_stat(&netfs_n_rh_download_done); - break; - default: - break; - } - - if (IS_ERR_VALUE(transferred_or_error)) { - subreq->error = transferred_or_error; - trace_netfs_failure(rreq, subreq, transferred_or_error, - netfs_fail_read); - goto failed; - } - - if (WARN(transferred_or_error > subreq->len - subreq->transferred, - "Subreq overread: R%x[%x] %zd > %zu - %zu", - rreq->debug_id, subreq->debug_index, - transferred_or_error, subreq->len, subreq->transferred)) - transferred_or_error = subreq->len - subreq->transferred; - - subreq->error = 0; - subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len && - !test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) - goto incomplete; - -complete: - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) - set_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags); - -out: - trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - /* If we decrement nr_outstanding to 0, the ref belongs to us. */ - u = atomic_dec_return(&rreq->nr_outstanding); - if (u == 0) - netfs_rreq_terminated(rreq, was_async); - else if (u == 1) - wake_up_var(&rreq->nr_outstanding); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); - return; - -incomplete: - if (test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags)) { - netfs_clear_unread(subreq); - subreq->transferred = subreq->len; - goto complete; - } - - if (transferred_or_error == 0) { - if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - if (rreq->origin != NETFS_DIO_READ) - subreq->error = -ENODATA; - goto failed; - } - } else { - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - } - - __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - goto out; - -failed: - if (subreq->source == NETFS_READ_FROM_CACHE) { - netfs_stat(&netfs_n_rh_read_failed); - set_bit(NETFS_RREQ_INCOMPLETE_IO, &rreq->flags); - } else { - netfs_stat(&netfs_n_rh_download_failed); - set_bit(NETFS_RREQ_FAILED, &rreq->flags); - rreq->error = subreq->error; - } - goto out; -} -EXPORT_SYMBOL(netfs_subreq_terminated); - -static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest *subreq, - loff_t i_size) -{ - struct netfs_io_request *rreq = subreq->rreq; - struct netfs_cache_resources *cres = &rreq->cache_resources; - - if (cres->ops) - return cres->ops->prepare_read(subreq, i_size); - if (subreq->start >= rreq->i_size) - return NETFS_FILL_WITH_ZEROES; - return NETFS_DOWNLOAD_FROM_SERVER; -} - -/* - * Work out what sort of subrequest the next one will be. - */ -static enum netfs_io_source -netfs_rreq_prepare_read(struct netfs_io_request *rreq, - struct netfs_io_subrequest *subreq, - struct iov_iter *io_iter) -{ - enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; - struct netfs_inode *ictx = netfs_inode(rreq->inode); - size_t lsize; - - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); - - if (rreq->origin != NETFS_DIO_READ) { - source = netfs_cache_prepare_read(subreq, rreq->i_size); - if (source == NETFS_INVALID_READ) - goto out; - } - - if (source == NETFS_DOWNLOAD_FROM_SERVER) { - /* Call out to the netfs to let it shrink the request to fit - * its own I/O sizes and boundaries. If it shinks it here, it - * will be called again to make simultaneous calls; if it wants - * to make serial calls, it can indicate a short read and then - * we will call it again. - */ - if (rreq->origin != NETFS_DIO_READ) { - if (subreq->start >= ictx->zero_point) { - source = NETFS_FILL_WITH_ZEROES; - goto set; - } - if (subreq->len > ictx->zero_point - subreq->start) - subreq->len = ictx->zero_point - subreq->start; - - /* We limit buffered reads to the EOF, but let the - * server deal with larger-than-EOF DIO/unbuffered - * reads. - */ - if (subreq->len > rreq->i_size - subreq->start) - subreq->len = rreq->i_size - subreq->start; - } - if (rreq->rsize && subreq->len > rreq->rsize) - subreq->len = rreq->rsize; - - if (rreq->netfs_ops->clamp_length && - !rreq->netfs_ops->clamp_length(subreq)) { - source = NETFS_INVALID_READ; - goto out; - } - - if (subreq->max_nr_segs) { - lsize = netfs_limit_iter(io_iter, 0, subreq->len, - subreq->max_nr_segs); - if (subreq->len > lsize) { - subreq->len = lsize; - trace_netfs_sreq(subreq, netfs_sreq_trace_limited); - } - } - } - -set: - if (subreq->len > rreq->len) - pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n", - rreq->debug_id, subreq->debug_index, - subreq->len, rreq->len); - - if (WARN_ON(subreq->len == 0)) { - source = NETFS_INVALID_READ; - goto out; - } - - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - - subreq->io_iter = *io_iter; - iov_iter_truncate(&subreq->io_iter, subreq->len); - iov_iter_advance(io_iter, subreq->len); -out: - subreq->source = source; - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); - return source; -} - -/* - * Slice off a piece of a read request and submit an I/O request for it. - */ -static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, - struct iov_iter *io_iter) -{ - struct netfs_io_subrequest *subreq; - enum netfs_io_source source; - - subreq = netfs_alloc_subrequest(rreq); - if (!subreq) - return false; - - subreq->start = rreq->start + rreq->submitted; - subreq->len = io_iter->count; - - _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); - list_add_tail(&subreq->rreq_link, &rreq->subrequests); - - /* Call out to the cache to find out what it can do with the remaining - * subset. It tells us in subreq->flags what it decided should be done - * and adjusts subreq->len down if the subset crosses a cache boundary. - * - * Then when we hand the subset, it can choose to take a subset of that - * (the starts must coincide), in which case, we go around the loop - * again and ask it to download the next piece. - */ - source = netfs_rreq_prepare_read(rreq, subreq, io_iter); - if (source == NETFS_INVALID_READ) - goto subreq_failed; - - atomic_inc(&rreq->nr_outstanding); - - rreq->submitted += subreq->len; - - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); - switch (source) { - case NETFS_FILL_WITH_ZEROES: - netfs_fill_with_zeroes(rreq, subreq); - break; - case NETFS_DOWNLOAD_FROM_SERVER: - netfs_read_from_server(rreq, subreq); - break; - case NETFS_READ_FROM_CACHE: - netfs_read_from_cache(rreq, subreq, NETFS_READ_HOLE_IGNORE); - break; - default: - BUG(); - } - - return true; - -subreq_failed: - rreq->error = subreq->error; - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_failed); - return false; -} - -/* - * Begin the process of reading in a chunk of data, where that data may be - * stitched together from multiple sources, including multiple servers and the - * local cache. - */ -int netfs_begin_read(struct netfs_io_request *rreq, bool sync) -{ - struct iov_iter io_iter; - int ret; - - _enter("R=%x %llx-%llx", - rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); - - if (rreq->len == 0) { - pr_err("Zero-sized read [R=%x]\n", rreq->debug_id); - return -EIO; - } - - if (rreq->origin == NETFS_DIO_READ) - inode_dio_begin(rreq->inode); - - // TODO: Use bounce buffer if requested - rreq->io_iter = rreq->iter; - - INIT_WORK(&rreq->work, netfs_rreq_work); - - /* Chop the read into slices according to what the cache and the netfs - * want and submit each one. - */ - netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding); - atomic_set(&rreq->nr_outstanding, 1); - io_iter = rreq->io_iter; - do { - _debug("submit %llx + %llx >= %llx", - rreq->start, rreq->submitted, rreq->i_size); - if (!netfs_rreq_submit_slice(rreq, &io_iter)) - break; - if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags)) - break; - if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && - test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) - break; - - } while (rreq->submitted < rreq->len); - - if (!rreq->submitted) { - netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); - if (rreq->origin == NETFS_DIO_READ) - inode_dio_end(rreq->inode); - ret = 0; - goto out; - } - - if (sync) { - /* Keep nr_outstanding incremented so that the ref always - * belongs to us, and the service code isn't punted off to a - * random thread pool to process. Note that this might start - * further work, such as writing to the cache. - */ - wait_var_event(&rreq->nr_outstanding, - atomic_read(&rreq->nr_outstanding) == 1); - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_assess(rreq, false); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - - ret = rreq->error; - if (ret == 0) { - if (rreq->origin == NETFS_DIO_READ) { - ret = rreq->transferred; - } else if (rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - } - } else { - /* If we decrement nr_outstanding to 0, the ref belongs to us. */ - if (atomic_dec_and_test(&rreq->nr_outstanding)) - netfs_rreq_assess(rreq, false); - ret = -EIOCBQUEUED; - } - -out: - return ret; -} diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index b781bbbf1d8d..72a435e5fc6d 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -188,9 +188,59 @@ static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offse return min(span, max_size); } +/* + * Select the span of a folio queue iterator we're going to use. Limit it by + * both maximum size and maximum number of segments. Returns the size of the + * span in bytes. + */ +static size_t netfs_limit_folioq(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int nsegs = 0; + unsigned int slot = iter->folioq_slot; + size_t span = 0, n = iter->count; + + if (WARN_ON(!iov_iter_is_folioq(iter)) || + WARN_ON(start_offset > n) || + n == 0) + return 0; + max_size = umin(max_size, n - start_offset); + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + + start_offset += iter->iov_offset; + do { + size_t flen = folioq_folio_size(folioq, slot); + + if (start_offset < flen) { + span += flen - start_offset; + nsegs++; + start_offset = 0; + } else { + start_offset -= flen; + } + if (span >= max_size || nsegs >= max_segs) + break; + + slot++; + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + } while (folioq); + + return umin(span, max_size); +} + size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs) { + if (iov_iter_is_folioq(iter)) + return netfs_limit_folioq(iter, start_offset, max_size, max_segs); if (iov_iter_is_bvec(iter)) return netfs_limit_bvec(iter, start_offset, max_size, max_segs); if (iov_iter_is_xarray(iter)) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 9d6b49dc6694..6c7be1377ee0 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -36,13 +36,14 @@ DEFINE_SPINLOCK(netfs_proc_lock); static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READAHEAD] = "RA", [NETFS_READPAGE] = "RP", + [NETFS_READ_GAPS] = "RG", [NETFS_READ_FOR_WRITE] = "RW", - [NETFS_COPY_TO_CACHE] = "CC", + [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", [NETFS_WRITETHROUGH] = "WT", [NETFS_UNBUFFERED_WRITE] = "UW", - [NETFS_DIO_READ] = "DR", [NETFS_DIO_WRITE] = "DW", + [NETFS_PGPRIV2_COPY_TO_CACHE] = "2C", }; /* @@ -62,7 +63,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v) rreq = list_entry(v, struct netfs_io_request, proc_link); seq_printf(m, - "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx", + "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx", rreq->debug_id, netfs_origins[rreq->origin], refcount_read(&rreq->ref), diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index c1f321cf5999..0ad0982ce0e2 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -8,6 +8,100 @@ #include #include "internal.h" +/* + * Append a folio to the rolling queue. + */ +int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, + bool needs_put) +{ + struct folio_queue *tail = rreq->buffer_tail; + unsigned int slot, order = folio_order(folio); + + if (WARN_ON_ONCE(!rreq->buffer && tail) || + WARN_ON_ONCE(rreq->buffer && !tail)) + return -EIO; + + if (!tail || folioq_full(tail)) { + tail = kmalloc(sizeof(*tail), GFP_NOFS); + if (!tail) + return -ENOMEM; + netfs_stat(&netfs_n_folioq); + folioq_init(tail); + tail->prev = rreq->buffer_tail; + if (tail->prev) + tail->prev->next = tail; + rreq->buffer_tail = tail; + if (!rreq->buffer) { + rreq->buffer = tail; + iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); + } + rreq->buffer_tail_slot = 0; + } + + rreq->io_iter.count += PAGE_SIZE << order; + + slot = folioq_append(tail, folio); + /* Store the counter after setting the slot. */ + smp_store_release(&rreq->buffer_tail_slot, slot); + return 0; +} + +/* + * Delete the head of a rolling queue. + */ +struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) +{ + struct folio_queue *head = wreq->buffer, *next = head->next; + + if (next) + next->prev = NULL; + netfs_stat_d(&netfs_n_folioq); + kfree(head); + wreq->buffer = next; + return next; +} + +/* + * Clear out a rolling queue. + */ +void netfs_clear_buffer(struct netfs_io_request *rreq) +{ + struct folio_queue *p; + + while ((p = rreq->buffer)) { + rreq->buffer = p->next; + for (int slot = 0; slot < folioq_nr_slots(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + if (!folio) + continue; + if (folioq_is_marked(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_put); + folio_put(folio); + } + } + netfs_stat_d(&netfs_n_folioq); + kfree(p); + } +} + +/* + * Reset the subrequest iterator to refer just to the region remaining to be + * read. The iterator may or may not have been advanced by socket ops or + * extraction ops to an extent that may or may not match the amount actually + * read. + */ +void netfs_reset_iter(struct netfs_io_subrequest *subreq) +{ + struct iov_iter *io_iter = &subreq->io_iter; + size_t remain = subreq->len - subreq->transferred; + + if (io_iter->count > remain) + iov_iter_advance(io_iter, io_iter->count - remain); + else if (io_iter->count < remain) + iov_iter_revert(io_iter, remain - io_iter->count); + iov_iter_truncate(&subreq->io_iter, remain); +} + /** * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback * @mapping: The mapping the folio belongs to. diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index 0294df70c3ff..31e388ec6e48 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -36,7 +36,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, memset(rreq, 0, kmem_cache_size(cache)); rreq->start = start; rreq->len = len; - rreq->upper_len = len; rreq->origin = origin; rreq->netfs_ops = ctx->ops; rreq->mapping = mapping; @@ -44,13 +43,23 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, rreq->i_size = i_size_read(inode); rreq->debug_id = atomic_inc_return(&debug_ids); rreq->wsize = INT_MAX; + rreq->io_streams[0].sreq_max_len = ULONG_MAX; + rreq->io_streams[0].sreq_max_segs = 0; spin_lock_init(&rreq->lock); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); INIT_LIST_HEAD(&rreq->subrequests); - INIT_WORK(&rreq->work, NULL); refcount_set(&rreq->ref, 1); + if (origin == NETFS_READAHEAD || + origin == NETFS_READPAGE || + origin == NETFS_READ_GAPS || + origin == NETFS_READ_FOR_WRITE || + origin == NETFS_DIO_READ) + INIT_WORK(&rreq->work, netfs_read_termination_worker); + else + INIT_WORK(&rreq->work, netfs_write_collection_worker); + __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); @@ -134,6 +143,7 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + netfs_clear_buffer(rreq); if (atomic_dec_and_test(&ictx->io_count)) wake_up_var(&ictx->io_count); @@ -155,7 +165,7 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, if (was_async) { rreq->work.func = netfs_free_request; if (!queue_work(system_unbound_wq, &rreq->work)) - BUG(); + WARN_ON(1); } else { netfs_free_request(&rreq->work); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c new file mode 100644 index 000000000000..b18c65ba5580 --- /dev/null +++ b/fs/netfs/read_collect.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem read subrequest result collection, assessment and + * retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Clear the unread part of an I/O request. + */ +static void netfs_clear_unread(struct netfs_io_subrequest *subreq) +{ + netfs_reset_iter(subreq); + WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter)); + iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter); + if (subreq->start + subreq->transferred >= subreq->rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); +} + +/* + * Flush, mark and unlock a folio that's now completely read. If we want to + * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it + * dirty and let writeback handle it. + */ +static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot) +{ + struct netfs_folio *finfo; + struct folio *folio = folioq_folio(folioq, slot); + + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + + if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + finfo = netfs_folio_info(folio); + if (finfo) { + trace_netfs_folio(folio, netfs_folio_trace_filled_gaps); + if (finfo->netfs_group) + folio_change_private(folio, finfo->netfs_group); + else + folio_detach_private(folio); + kfree(finfo); + } + + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); + folio_mark_dirty(folio); + } + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_done); + } + } else { + // TODO: Use of PG_private_2 is deprecated. + if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { + _debug("no unlock"); + } else { + trace_netfs_folio(folio, netfs_folio_trace_read_unlock); + folio_unlock(folio); + } + } +} + +/* + * Unlock any folios that are now completely read. Returns true if the + * subrequest is removed from the list. + */ +static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) +{ + struct netfs_io_subrequest *prev, *next; + struct netfs_io_request *rreq = subreq->rreq; + struct folio_queue *folioq = subreq->curr_folioq; + size_t avail, prev_donated, next_donated, fsize, part, excess; + loff_t fpos, start; + loff_t fend; + int slot = subreq->curr_folioq_slot; + + if (WARN(subreq->transferred > subreq->len, + "Subreq overread: R%x[%x] %zu > %zu", + rreq->debug_id, subreq->debug_index, + subreq->transferred, subreq->len)) + subreq->transferred = subreq->len; + +next_folio: + fsize = PAGE_SIZE << subreq->curr_folio_order; + fpos = round_down(subreq->start + subreq->consumed, fsize); + fend = fpos + fsize; + + if (WARN_ON_ONCE(!folioq) || + WARN_ON_ONCE(!folioq_folio(folioq, slot)) || + WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { + pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->start + subreq->transferred - 1, + subreq->consumed, subreq->transferred, subreq->len, + slot); + if (folioq) { + struct folio *folio = folioq_folio(folioq, slot); + + pr_err("folioq: orders=%02x%02x%02x%02x\n", + folioq->orders[0], folioq->orders[1], + folioq->orders[2], folioq->orders[3]); + if (folio) + pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n", + fpos, fend - 1, folio_pos(folio), folio_order(folio), + folioq_folio_order(folioq, slot)); + } + } + +donation_changed: + /* Try to consume the current folio if we've hit or passed the end of + * it. There's a possibility that this subreq doesn't start at the + * beginning of the folio, in which case we need to donate to/from the + * preceding subreq. + * + * We also need to include any potential donation back from the + * following subreq. + */ + prev_donated = READ_ONCE(subreq->prev_donated); + next_donated = READ_ONCE(subreq->next_donated); + if (prev_donated || next_donated) { + spin_lock_bh(&rreq->lock); + prev_donated = subreq->prev_donated; + next_donated = subreq->next_donated; + subreq->start -= prev_donated; + subreq->len += prev_donated; + subreq->transferred += prev_donated; + prev_donated = subreq->prev_donated = 0; + if (subreq->transferred == subreq->len) { + subreq->len += next_donated; + subreq->transferred += next_donated; + next_donated = subreq->next_donated = 0; + } + trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations); + spin_unlock_bh(&rreq->lock); + } + + avail = subreq->transferred; + if (avail == subreq->len) + avail += next_donated; + start = subreq->start; + if (subreq->consumed == 0) { + start -= prev_donated; + avail += prev_donated; + } else { + start += subreq->consumed; + avail -= subreq->consumed; + } + part = umin(avail, fsize); + + trace_netfs_progress(subreq, start, avail, part); + + if (start + avail >= fend) { + if (fpos == start) { + /* Flush, unlock and mark for caching any folio we've just read. */ + subreq->consumed = fend - subreq->start; + netfs_unlock_read_folio(subreq, rreq, folioq, slot); + folioq_mark2(folioq, slot); + if (subreq->consumed >= subreq->len) + goto remove_subreq; + } else if (fpos < start) { + excess = fend - subreq->start; + + spin_lock_bh(&rreq->lock); + /* If we complete first on a folio split with the + * preceding subreq, donate to that subreq - otherwise + * we get the responsibility. + */ + if (subreq->prev_donated != prev_donated) { + spin_unlock_bh(&rreq->lock); + goto donation_changed; + } + + if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + spin_unlock_bh(&rreq->lock); + pr_err("Can't donate prior to front\n"); + goto bad; + } + + prev = list_prev_entry(subreq, rreq_link); + WRITE_ONCE(prev->next_donated, prev->next_donated + excess); + subreq->start += excess; + subreq->len -= excess; + subreq->transferred -= excess; + trace_netfs_donate(rreq, subreq, prev, excess, + netfs_trace_donate_tail_to_prev); + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); + + if (subreq->consumed >= subreq->len) + goto remove_subreq_locked; + spin_unlock_bh(&rreq->lock); + } else { + pr_err("fpos > start\n"); + goto bad; + } + + /* Advance the rolling buffer to the next folio. */ + slot++; + if (slot >= folioq_nr_slots(folioq)) { + slot = 0; + folioq = folioq->next; + subreq->curr_folioq = folioq; + } + subreq->curr_folioq_slot = slot; + if (folioq && folioq_folio(folioq, slot)) + subreq->curr_folio_order = folioq->orders[slot]; + if (!was_async) + cond_resched(); + goto next_folio; + } + + /* Deal with partial progress. */ + if (subreq->transferred < subreq->len) + return false; + + /* Donate the remaining downloaded data to one of the neighbouring + * subrequests. Note that we may race with them doing the same thing. + */ + spin_lock_bh(&rreq->lock); + + if (subreq->prev_donated != prev_donated || + subreq->next_donated != next_donated) { + spin_unlock_bh(&rreq->lock); + cond_resched(); + goto donation_changed; + } + + /* Deal with the trickiest case: that this subreq is in the middle of a + * folio, not touching either edge, but finishes first. In such a + * case, we donate to the previous subreq, if there is one, so that the + * donation is only handled when that completes - and remove this + * subreq from the list. + * + * If the previous subreq finished first, we will have acquired their + * donation and should be able to unlock folios and/or donate nextwards. + */ + if (!subreq->consumed && + !prev_donated && + !list_is_first(&subreq->rreq_link, &rreq->subrequests)) { + prev = list_prev_entry(subreq, rreq_link); + WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len); + subreq->start += subreq->len; + subreq->len = 0; + subreq->transferred = 0; + trace_netfs_donate(rreq, subreq, prev, subreq->len, + netfs_trace_donate_to_prev); + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev); + goto remove_subreq_locked; + } + + /* If we can't donate down the chain, donate up the chain instead. */ + excess = subreq->len - subreq->consumed + next_donated; + + if (!subreq->consumed) + excess += prev_donated; + + if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + rreq->prev_donated = excess; + trace_netfs_donate(rreq, subreq, NULL, excess, + netfs_trace_donate_to_deferred_next); + } else { + next = list_next_entry(subreq, rreq_link); + WRITE_ONCE(next->prev_donated, excess); + trace_netfs_donate(rreq, subreq, next, excess, + netfs_trace_donate_to_next); + } + trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next); + subreq->len = subreq->consumed; + subreq->transferred = subreq->consumed; + goto remove_subreq_locked; + +remove_subreq: + spin_lock_bh(&rreq->lock); +remove_subreq_locked: + subreq->consumed = subreq->len; + list_del(&subreq->rreq_link); + spin_unlock_bh(&rreq->lock); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed); + return true; + +bad: + /* Errr... prev and next both donated to us, but insufficient to finish + * the folio. + */ + printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n", + rreq->debug_id, subreq->debug_index, + subreq->start, subreq->start + subreq->transferred - 1, + subreq->consumed, subreq->transferred, subreq->len); + printk("folio: %llx-%llx\n", fpos, fend - 1); + printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated); + printk("s=%llx av=%zx part=%zx\n", start, avail, part); + BUG(); +} + +/* + * Do page flushing and suchlike after DIO. + */ +static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + unsigned int i; + + /* Collect unbuffered reads and direct reads, adding up the transfer + * sizes until we find the first short or failed subrequest. + */ + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + rreq->transferred += subreq->transferred; + + if (subreq->transferred < subreq->len || + test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { + rreq->error = subreq->error; + break; + } + } + + if (rreq->origin == NETFS_DIO_READ) { + for (i = 0; i < rreq->direct_bv_count; i++) { + flush_dcache_page(rreq->direct_bv[i].bv_page); + // TODO: cifs marks pages in the destination buffer + // dirty under some circumstances after a read. Do we + // need to do that too? + set_page_dirty(rreq->direct_bv[i].bv_page); + } + } + + if (rreq->iocb) { + rreq->iocb->ki_pos += rreq->transferred; + if (rreq->iocb->ki_complete) + rreq->iocb->ki_complete( + rreq->iocb, rreq->error ? rreq->error : rreq->transferred); + } + if (rreq->netfs_ops->done) + rreq->netfs_ops->done(rreq); + if (rreq->origin == NETFS_DIO_READ) + inode_dio_end(rreq->inode); +} + +/* + * Assess the state of a read request and decide what to do next. + * + * Note that we're in normal kernel thread context at this point, possibly + * running on a workqueue. + */ +static void netfs_rreq_assess(struct netfs_io_request *rreq) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_assess); + + //netfs_rreq_is_still_valid(rreq); + + if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { + netfs_retry_reads(rreq); + return; + } + + if (rreq->origin == NETFS_DIO_READ || + rreq->origin == NETFS_READ_GAPS) + netfs_rreq_assess_dio(rreq); + task_io_account_read(rreq->transferred); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); + clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + + trace_netfs_rreq(rreq, netfs_rreq_trace_done); + netfs_clear_subrequests(rreq, false); + netfs_unlock_abandoned_read_pages(rreq); + if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))) + netfs_pgpriv2_write_to_the_cache(rreq); +} + +void netfs_read_termination_worker(struct work_struct *work) +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + netfs_see_request(rreq, netfs_rreq_trace_see_work); + netfs_rreq_assess(rreq); + netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); +} + +/* + * Handle the completion of all outstanding I/O operations on a read request. + * We inherit a ref from the caller. + */ +void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) +{ + if (!was_async) + return netfs_rreq_assess(rreq); + if (!work_pending(&rreq->work)) { + netfs_get_request(rreq, netfs_rreq_trace_get_work); + if (!queue_work(system_unbound_wq, &rreq->work)) + netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); + } +} + +/** + * netfs_read_subreq_progress - Note progress of a read operation. + * @subreq: The read request that has terminated. + * @was_async: True if we're in an asynchronous context. + * + * This tells the read side of netfs lib that a contributory I/O operation has + * made some progress and that it may be possible to unlock some folios. + * + * Before calling, the filesystem should update subreq->transferred to track + * the amount of data copied into the output buffer. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, + bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + trace_netfs_sreq(subreq, netfs_sreq_trace_progress); + + if (subreq->transferred > subreq->consumed && + (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE)) { + netfs_consume_read_data(subreq, was_async); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } +} +EXPORT_SYMBOL(netfs_read_subreq_progress); + +/** + * netfs_read_subreq_terminated - Note the termination of an I/O operation. + * @subreq: The I/O request that has terminated. + * @error: Error code indicating type of completion. + * @was_async: The termination was asynchronous + * + * This tells the read helper that a contributory I/O operation has terminated, + * one way or another, and that it should integrate the results. + * + * The caller indicates the outcome of the operation through @error, supplying + * 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY + * is set) or a negative error code. The helper will look after reissuing I/O + * operations as appropriate and writing downloaded data to the cache. + * + * Before calling, the filesystem should update subreq->transferred to track + * the amount of data copied into the output buffer. + * + * If @was_async is true, the caller might be running in softirq or interrupt + * context and we can't sleep. + */ +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, + int error, bool was_async) +{ + struct netfs_io_request *rreq = subreq->rreq; + + switch (subreq->source) { + case NETFS_READ_FROM_CACHE: + netfs_stat(&netfs_n_rh_read_done); + break; + case NETFS_DOWNLOAD_FROM_SERVER: + netfs_stat(&netfs_n_rh_download_done); + break; + default: + break; + } + + if (rreq->origin != NETFS_DIO_READ) { + /* Collect buffered reads. + * + * If the read completed validly short, then we can clear the + * tail before going on to unlock the folios. + */ + if (error == 0 && subreq->transferred < subreq->len && + (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) || + test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) { + netfs_clear_unread(subreq); + subreq->transferred = subreq->len; + trace_netfs_sreq(subreq, netfs_sreq_trace_clear); + } + if (subreq->transferred > subreq->consumed && + (rreq->origin == NETFS_READAHEAD || + rreq->origin == NETFS_READPAGE || + rreq->origin == NETFS_READ_FOR_WRITE)) { + netfs_consume_read_data(subreq, was_async); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + } + rreq->transferred += subreq->transferred; + } + + /* Deal with retry requests, short reads and errors. If we retry + * but don't make progress, we abandon the attempt. + */ + if (!error && subreq->transferred < subreq->len) { + if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { + trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); + } else { + trace_netfs_sreq(subreq, netfs_sreq_trace_short); + if (subreq->transferred > subreq->consumed) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } else { + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + error = -ENODATA; + } + } + } + + subreq->error = error; + trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); + + if (unlikely(error < 0)) { + trace_netfs_failure(rreq, subreq, error, netfs_fail_read); + if (subreq->source == NETFS_READ_FROM_CACHE) { + netfs_stat(&netfs_n_rh_read_failed); + } else { + netfs_stat(&netfs_n_rh_download_failed); + set_bit(NETFS_RREQ_FAILED, &rreq->flags); + rreq->error = subreq->error; + } + } + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} +EXPORT_SYMBOL(netfs_read_subreq_terminated); diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c new file mode 100644 index 000000000000..ba5af89d37fa --- /dev/null +++ b/fs/netfs/read_pgpriv2.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Read with PG_private_2 [DEPRECATED]. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2. The + * third mark in the folio queue is used to indicate that this folio needs + * writing. + */ +void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, + struct netfs_io_request *rreq, + struct folio_queue *folioq, + int slot) +{ + struct folio *folio = folioq_folio(folioq, slot); + + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folioq_mark3(folioq, slot); +} + +/* + * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an + * unrecoverable error. + */ +static void netfs_pgpriv2_cancel(struct folio_queue *folioq) +{ + struct folio *folio; + int slot; + + while (folioq) { + if (!folioq->marks3) { + folioq = folioq->next; + continue; + } + + slot = __ffs(folioq->marks3); + folio = folioq_folio(folioq, slot); + + trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); + folio_end_private_2(folio); + folioq_unmark3(folioq, slot); + } +} + +/* + * [DEPRECATED] Copy a folio to the cache with PG_private_2 set. + */ +static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio *folio) +{ + struct netfs_io_stream *cache = &wreq->io_streams[1]; + size_t fsize = folio_size(folio), flen = fsize; + loff_t fpos = folio_pos(folio), i_size; + bool to_eof = false; + + _enter(""); + + /* netfs_perform_write() may shift i_size around the page or from out + * of the page to beyond it, but cannot move i_size into or through the + * page since we have it locked. + */ + i_size = i_size_read(wreq->inode); + + if (fpos >= i_size) { + /* mmap beyond eof. */ + _debug("beyond eof"); + folio_end_private_2(folio); + return 0; + } + + if (fpos + fsize > wreq->i_size) + wreq->i_size = i_size; + + if (flen > i_size - fpos) { + flen = i_size - fpos; + to_eof = true; + } else if (flen == i_size - fpos) { + to_eof = true; + } + + _debug("folio %zx %zx", flen, fsize); + + trace_netfs_folio(folio, netfs_folio_trace_store_copy); + + /* Attach the folio to the rolling buffer. */ + if (netfs_buffer_append_folio(wreq, folio, false) < 0) + return -ENOMEM; + + cache->submit_extendable_to = fsize; + cache->submit_off = 0; + cache->submit_len = flen; + + /* Attach the folio to one or more subrequests. For a big folio, we + * could end up with thousands of subrequests if the wsize is small - + * but we might need to wait during the creation of subrequests for + * network resources (eg. SMB credits). + */ + do { + ssize_t part; + + wreq->io_iter.iov_offset = cache->submit_off; + + atomic64_set(&wreq->issued_to, fpos + cache->submit_off); + cache->submit_extendable_to = fsize - cache->submit_off; + part = netfs_advance_write(wreq, cache, fpos + cache->submit_off, + cache->submit_len, to_eof); + cache->submit_off += part; + if (part > cache->submit_len) + cache->submit_len = 0; + else + cache->submit_len -= part; + } while (cache->submit_len > 0); + + wreq->io_iter.iov_offset = 0; + iov_iter_advance(&wreq->io_iter, fsize); + atomic64_set(&wreq->issued_to, fpos + fsize); + + if (flen < fsize) + netfs_issue_write(wreq, cache); + + _leave(" = 0"); + return 0; +} + +/* + * [DEPRECATED] Go through the buffer and write any folios that are marked with + * the third mark to the cache. + */ +void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq) +{ + struct netfs_io_request *wreq; + struct folio_queue *folioq; + struct folio *folio; + int error = 0; + int slot = 0; + + _enter(""); + + if (!fscache_resources_valid(&rreq->cache_resources)) + goto couldnt_start; + + /* Need the first folio to be able to set up the op. */ + for (folioq = rreq->buffer; folioq; folioq = folioq->next) { + if (folioq->marks3) { + slot = __ffs(folioq->marks3); + break; + } + } + if (!folioq) + return; + folio = folioq_folio(folioq, slot); + + wreq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio), + NETFS_PGPRIV2_COPY_TO_CACHE); + if (IS_ERR(wreq)) { + kleave(" [create %ld]", PTR_ERR(wreq)); + goto couldnt_start; + } + + trace_netfs_write(wreq, netfs_write_trace_copy_to_cache); + netfs_stat(&netfs_n_wh_copy_to_cache); + + for (;;) { + error = netfs_pgpriv2_copy_folio(wreq, folio); + if (error < 0) + break; + + folioq_unmark3(folioq, slot); + if (!folioq->marks3) { + folioq = folioq->next; + if (!folioq) + break; + } + + slot = __ffs(folioq->marks3); + folio = folioq_folio(folioq, slot); + } + + netfs_issue_write(wreq, &wreq->io_streams[1]); + smp_wmb(); /* Write lists before ALL_QUEUED. */ + set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); + + netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + _leave(" = %d", error); +couldnt_start: + netfs_pgpriv2_cancel(rreq->buffer); +} + +/* + * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished + * copying. + */ +bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) +{ + struct folio_queue *folioq = wreq->buffer; + unsigned long long collected_to = wreq->collected_to; + unsigned int slot = wreq->buffer_head_slot; + bool made_progress = false; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + + for (;;) { + struct folio *folio; + unsigned long long fpos, fend; + size_t fsize, flen; + + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_private_2(folio), + "R=%08x: folio %lx is not marked private_2\n", + wreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); + + fpos = folio_pos(folio); + fsize = folio_size(folio); + flen = fsize; + + fend = min_t(unsigned long long, fpos + flen, wreq->i_size); + + trace_netfs_collect_folio(wreq, folio, fend, collected_to); + + /* Unlock any folio we've transferred all of. */ + if (collected_to < fend) + break; + + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + wreq->cleaned_to = fpos + fsize; + made_progress = true; + + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (READ_ONCE(wreq->buffer_tail) == folioq) + break; + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + + if (fpos + fsize >= collected_to) + break; + } + + wreq->buffer = folioq; + wreq->buffer_head_slot = slot; + return made_progress; +} diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c new file mode 100644 index 000000000000..0350592ea804 --- /dev/null +++ b/fs/netfs/read_retry.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Network filesystem read subrequest retrying. + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include "internal.h" + +static void netfs_reissue_read(struct netfs_io_request *rreq, + struct netfs_io_subrequest *subreq) +{ + struct iov_iter *io_iter = &subreq->io_iter; + + if (iov_iter_is_folioq(io_iter)) { + subreq->curr_folioq = (struct folio_queue *)io_iter->folioq; + subreq->curr_folioq_slot = io_iter->folioq_slot; + subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot]; + } + + atomic_inc(&rreq->nr_outstanding); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); + subreq->rreq->netfs_ops->issue_read(subreq); +} + +/* + * Go through the list of failed/short reads, retrying all retryable ones. We + * need to switch failed cache reads to network downloads. + */ +static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) +{ + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream0 = &rreq->io_streams[0]; + LIST_HEAD(sublist); + LIST_HEAD(queue); + + _enter("R=%x", rreq->debug_id); + + if (list_empty(&rreq->subrequests)) + return; + + if (rreq->netfs_ops->retry_request) + rreq->netfs_ops->retry_request(rreq, NULL); + + /* If there's no renegotiation to do, just resend each retryable subreq + * up to the first permanently failed one. + */ + if (!rreq->netfs_ops->prepare_read && + !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + struct netfs_io_subrequest *subreq; + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) + break; + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + netfs_reset_iter(subreq); + netfs_reissue_read(rreq, subreq); + } + } + return; + } + + /* Okay, we need to renegotiate all the download requests and flip any + * failed cache reads over to being download requests and negotiate + * those also. All fully successful subreqs have been removed from the + * list and any spare data from those has been donated. + * + * What we do is decant the list and rebuild it one subreq at a time so + * that we don't end up with donations jumping over a gap we're busy + * populating with smaller subrequests. In the event that the subreq + * we just launched finishes before we insert the next subreq, it'll + * fill in rreq->prev_donated instead. + + * Note: Alternatively, we could split the tail subrequest right before + * we reissue it and fix up the donations under lock. + */ + list_splice_init(&rreq->subrequests, &queue); + + do { + struct netfs_io_subrequest *from; + struct iov_iter source; + unsigned long long start, len; + size_t part, deferred_next_donated = 0; + bool boundary = false; + + /* Go through the subreqs and find the next span of contiguous + * buffer that we then rejig (cifs, for example, needs the + * rsize renegotiating) and reissue. + */ + from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); + list_move_tail(&from->rreq_link, &sublist); + start = from->start + from->transferred; + len = from->len - from->transferred; + + _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", + rreq->debug_id, from->debug_index, + from->start, from->consumed, from->transferred, from->len); + + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) + goto abandon; + + deferred_next_donated = from->next_donated; + while ((subreq = list_first_entry_or_null( + &queue, struct netfs_io_subrequest, rreq_link))) { + if (subreq->start != start + len || + subreq->transferred > 0 || + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) + break; + list_move_tail(&subreq->rreq_link, &sublist); + len += subreq->len; + deferred_next_donated = subreq->next_donated; + if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) + break; + } + + _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); + + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + + /* Work through the sublist. */ + while ((subreq = list_first_entry_or_null( + &sublist, struct netfs_io_subrequest, rreq_link))) { + list_del(&subreq->rreq_link); + + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start - subreq->transferred; + subreq->len = len + subreq->transferred; + stream0->sreq_max_len = subreq->len; + + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + + spin_lock_bh(&rreq->lock); + list_add_tail(&subreq->rreq_link, &rreq->subrequests); + subreq->prev_donated += rreq->prev_donated; + rreq->prev_donated = 0; + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + spin_unlock_bh(&rreq->lock); + + BUG_ON(!len); + + /* Renegotiate max_len (rsize) */ + if (rreq->netfs_ops->prepare_read(subreq) < 0) { + trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); + } + + part = umin(len, stream0->sreq_max_len); + if (unlikely(rreq->io_streams[0].sreq_max_segs)) + part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); + subreq->len = subreq->transferred + part; + subreq->io_iter = source; + iov_iter_truncate(&subreq->io_iter, part); + iov_iter_advance(&source, part); + len -= part; + start += part; + if (!len) { + if (boundary) + __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + subreq->next_donated = deferred_next_donated; + } else { + __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); + subreq->next_donated = 0; + } + + netfs_reissue_read(rreq, subreq); + if (!len) + break; + + /* If we ran out of subrequests, allocate another. */ + if (list_empty(&sublist)) { + subreq = netfs_alloc_subrequest(rreq); + if (!subreq) + goto abandon; + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; + subreq->start = start; + + /* We get two refs, but need just one. */ + netfs_put_subrequest(subreq, false, netfs_sreq_trace_new); + trace_netfs_sreq(subreq, netfs_sreq_trace_split); + list_add_tail(&subreq->rreq_link, &sublist); + } + } + + /* If we managed to use fewer subreqs, we can discard the + * excess. + */ + while ((subreq = list_first_entry_or_null( + &sublist, struct netfs_io_subrequest, rreq_link))) { + trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + list_del(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + } + + } while (!list_empty(&queue)); + + return; + + /* If we hit ENOMEM, fail all remaining subrequests */ +abandon: + list_splice_init(&sublist, &queue); + list_for_each_entry(subreq, &queue, rreq_link) { + if (!subreq->error) + subreq->error = -ENOMEM; + __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); + } + spin_lock_bh(&rreq->lock); + list_splice_tail_init(&queue, &rreq->subrequests); + spin_unlock_bh(&rreq->lock); +} + +/* + * Retry reads. + */ +void netfs_retry_reads(struct netfs_io_request *rreq) +{ + trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); + + atomic_inc(&rreq->nr_outstanding); + + netfs_retry_read_subrequests(rreq); + + if (atomic_dec_and_test(&rreq->nr_outstanding)) + netfs_rreq_terminated(rreq, false); +} + +/* + * Unlock any the pages that haven't been unlocked yet due to abandoned + * subrequests. + */ +void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq) +{ + struct folio_queue *p; + + for (p = rreq->buffer; p; p = p->next) { + for (int slot = 0; slot < folioq_count(p); slot++) { + struct folio *folio = folioq_folio(p, slot); + + if (folio && !folioq_is_marked2(p, slot)) { + trace_netfs_folio(folio, netfs_folio_trace_abandon); + folio_unlock(folio); + } + } + } +} diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c index 0892768eea32..8e63516b40f6 100644 --- a/fs/netfs/stats.c +++ b/fs/netfs/stats.c @@ -32,6 +32,7 @@ atomic_t netfs_n_wh_buffered_write; atomic_t netfs_n_wh_writethrough; atomic_t netfs_n_wh_dio_write; atomic_t netfs_n_wh_writepages; +atomic_t netfs_n_wh_copy_to_cache; atomic_t netfs_n_wh_wstream_conflict; atomic_t netfs_n_wh_upload; atomic_t netfs_n_wh_upload_done; @@ -39,45 +40,53 @@ atomic_t netfs_n_wh_upload_failed; atomic_t netfs_n_wh_write; atomic_t netfs_n_wh_write_done; atomic_t netfs_n_wh_write_failed; +atomic_t netfs_n_wb_lock_skip; +atomic_t netfs_n_wb_lock_wait; +atomic_t netfs_n_folioq; int netfs_stats_show(struct seq_file *m, void *v) { - seq_printf(m, "Netfs : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", + seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_read_folio), atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "Netfs : BW=%u WT=%u DW=%u WP=%u\n", + seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", atomic_read(&netfs_n_wh_buffered_write), atomic_read(&netfs_n_wh_writethrough), atomic_read(&netfs_n_wh_dio_write), - atomic_read(&netfs_n_wh_writepages)); - seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n", + atomic_read(&netfs_n_wh_writepages), + atomic_read(&netfs_n_wh_copy_to_cache)); + seq_printf(m, "ZeroOps: ZR=%u sh=%u sk=%u\n", atomic_read(&netfs_n_rh_zero), atomic_read(&netfs_n_rh_short_read), atomic_read(&netfs_n_rh_write_zskip)); - seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n", + seq_printf(m, "DownOps: DL=%u ds=%u df=%u di=%u\n", atomic_read(&netfs_n_rh_download), atomic_read(&netfs_n_rh_download_done), atomic_read(&netfs_n_rh_download_failed), atomic_read(&netfs_n_rh_download_instead)); - seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n", + seq_printf(m, "CaRdOps: RD=%u rs=%u rf=%u\n", atomic_read(&netfs_n_rh_read), atomic_read(&netfs_n_rh_read_done), atomic_read(&netfs_n_rh_read_failed)); - seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n", + seq_printf(m, "UpldOps: UL=%u us=%u uf=%u\n", atomic_read(&netfs_n_wh_upload), atomic_read(&netfs_n_wh_upload_done), atomic_read(&netfs_n_wh_upload_failed)); - seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n", + seq_printf(m, "CaWrOps: WR=%u ws=%u wf=%u\n", atomic_read(&netfs_n_wh_write), atomic_read(&netfs_n_wh_write_done), atomic_read(&netfs_n_wh_write_failed)); - seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n", + seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n", atomic_read(&netfs_n_rh_rreq), atomic_read(&netfs_n_rh_sreq), + atomic_read(&netfs_n_folioq), atomic_read(&netfs_n_wh_wstream_conflict)); + seq_printf(m, "WbLock : skip=%u wait=%u\n", + atomic_read(&netfs_n_wb_lock_skip), + atomic_read(&netfs_n_wb_lock_wait)); return fscache_stats_show(m); } EXPORT_SYMBOL(netfs_stats_show); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index ae7a2043f670..1d438be2e1b4 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -15,15 +15,11 @@ /* Notes made in the collector */ #define HIT_PENDING 0x01 /* A front op was still pending */ -#define SOME_EMPTY 0x02 /* One of more streams are empty */ -#define ALL_EMPTY 0x04 /* All streams are empty */ -#define MAYBE_DISCONTIG 0x08 /* A front op may be discontiguous (rounded to PAGE_SIZE) */ -#define NEED_REASSESS 0x10 /* Need to loop round and reassess */ -#define REASSESS_DISCONTIG 0x20 /* Reassess discontiguity if contiguity advances */ -#define MADE_PROGRESS 0x40 /* Made progress cleaning up a stream or the folio set */ -#define BUFFERED 0x80 /* The pagecache needs cleaning up */ -#define NEED_RETRY 0x100 /* A front op requests retrying */ -#define SAW_FAILURE 0x200 /* One stream or hit a permanent failure */ +#define NEED_REASSESS 0x02 /* Need to loop round and reassess */ +#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ +#define BUFFERED 0x08 /* The pagecache needs cleaning up */ +#define NEED_RETRY 0x10 /* A front op requests retrying */ +#define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ /* * Successful completion of write of a folio to the server and/or cache. Note @@ -81,56 +77,38 @@ end_wb: return gcount; } -/* - * Get hold of a folio we have under writeback. We don't want to get the - * refcount on it. - */ -static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos) -{ - XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE); - struct folio *folio; - - rcu_read_lock(); - - for (;;) { - xas_reset(&xas); - folio = xas_load(&xas); - if (xas_retry(&xas, folio)) - continue; - - if (!folio || xa_is_value(folio)) - kdebug("R=%08x: folio %lx (%llx) not present", - wreq->debug_id, xas.xa_index, pos / PAGE_SIZE); - BUG_ON(!folio || xa_is_value(folio)); - - if (folio == xas_reload(&xas)) - break; - } - - rcu_read_unlock(); - - if (WARN_ONCE(!folio_test_writeback(folio), - "R=%08x: folio %lx is not under writeback\n", - wreq->debug_id, folio->index)) { - trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); - } - return folio; -} - /* * Unlock any folios we've finished with. */ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, - unsigned long long collected_to, unsigned int *notes) { + struct folio_queue *folioq = wreq->buffer; + unsigned long long collected_to = wreq->collected_to; + unsigned int slot = wreq->buffer_head_slot; + + if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { + if (netfs_pgpriv2_unlock_copied_folios(wreq)) + *notes |= MADE_PROGRESS; + return; + } + + if (slot >= folioq_nr_slots(folioq)) { + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + for (;;) { struct folio *folio; struct netfs_folio *finfo; unsigned long long fpos, fend; size_t fsize, flen; - folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to); + folio = folioq_folio(folioq, slot); + if (WARN_ONCE(!folio_test_writeback(folio), + "R=%08x: folio %lx is not under writeback\n", + wreq->debug_id, folio->index)) + trace_netfs_folio(folio, netfs_folio_trace_not_under_wback); fpos = folio_pos(folio); fsize = folio_size(folio); @@ -141,12 +119,6 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, trace_netfs_collect_folio(wreq, folio, fend, collected_to); - if (fpos + fsize > wreq->contiguity) { - trace_netfs_collect_contig(wreq, fpos + fsize, - netfs_contig_trace_unlock); - wreq->contiguity = fpos + fsize; - } - /* Unlock any folio we've transferred all of. */ if (collected_to < fend) break; @@ -155,9 +127,25 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, wreq->cleaned_to = fpos + fsize; *notes |= MADE_PROGRESS; + /* Clean up the head folioq. If we clear an entire folioq, then + * we can get rid of it provided it's not also the tail folioq + * being filled by the issuer. + */ + folioq_clear(folioq, slot); + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (READ_ONCE(wreq->buffer_tail) == folioq) + break; + folioq = netfs_delete_buffer_head(wreq); + slot = 0; + } + if (fpos + fsize >= collected_to) break; } + + wreq->buffer = folioq; + wreq->buffer_head_slot = slot; } /* @@ -188,9 +176,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + struct iov_iter source = subreq->io_iter; + + iov_iter_revert(&source, subreq->len - source.count); __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); } } return; @@ -200,6 +191,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, do { struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp; + struct iov_iter source; unsigned long long start, len; size_t part; bool boundary = false; @@ -227,6 +219,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, len += to->len; } + /* Determine the set of buffers we're going to use. Each + * subreq gets a subset of a single overall contiguous buffer. + */ + netfs_reset_iter(from); + source = from->io_iter; + source.count = len; + /* Work through the sublist. */ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { @@ -238,7 +237,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); stream->prepare_write(subreq); - part = min(len, subreq->max_len); + part = min(len, stream->sreq_max_len); subreq->len = part; subreq->start = start; subreq->transferred = 0; @@ -249,7 +248,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, boundary = true; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); if (subreq == to) break; } @@ -278,8 +277,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = to->source; subreq->start = start; - subreq->max_len = len; - subreq->max_nr_segs = INT_MAX; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); @@ -293,10 +290,12 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, to = list_next_entry(to, rreq_link); trace_netfs_sreq(subreq, netfs_sreq_trace_retry); + stream->sreq_max_len = len; + stream->sreq_max_segs = INT_MAX; switch (stream->source) { case NETFS_UPLOAD_TO_SERVER: netfs_stat(&netfs_n_wh_upload); - subreq->max_len = min(len, wreq->wsize); + stream->sreq_max_len = umin(len, wreq->wsize); break; case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write); @@ -307,7 +306,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, stream->prepare_write(subreq); - part = min(len, subreq->max_len); + part = umin(len, stream->sreq_max_len); subreq->len = subreq->transferred + part; len -= part; start += part; @@ -316,7 +315,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, boundary = false; } - netfs_reissue_write(stream, subreq); + netfs_reissue_write(stream, subreq, &source); if (!len) break; @@ -377,7 +376,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) { struct netfs_io_subrequest *front, *remove; struct netfs_io_stream *stream; - unsigned long long collected_to; + unsigned long long collected_to, issued_to; unsigned int notes; int s; @@ -386,28 +385,22 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) trace_netfs_rreq(wreq, netfs_rreq_trace_collect); reassess_streams: + issued_to = atomic64_read(&wreq->issued_to); smp_rmb(); collected_to = ULLONG_MAX; - if (wreq->origin == NETFS_WRITEBACK) - notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG; - else if (wreq->origin == NETFS_WRITETHROUGH) - notes = ALL_EMPTY | BUFFERED; + if (wreq->origin == NETFS_WRITEBACK || + wreq->origin == NETFS_WRITETHROUGH || + wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) + notes = BUFFERED; else - notes = ALL_EMPTY; + notes = 0; /* Remove completed subrequests from the front of the streams and * advance the completion point on each stream. We stop when we hit * something that's in progress. The issuer thread may be adding stuff * to the tail whilst we're doing this. - * - * We must not, however, merge in discontiguities that span whole - * folios that aren't under writeback. This is made more complicated - * by the folios in the gap being of unpredictable sizes - if they even - * exist - but we don't want to look them up. */ for (s = 0; s < NR_IO_STREAMS; s++) { - loff_t rstart, rend; - stream = &wreq->io_streams[s]; /* Read active flag before list pointers */ if (!smp_load_acquire(&stream->active)) @@ -419,26 +412,10 @@ reassess_streams: //_debug("sreq [%x] %llx %zx/%zx", // front->debug_index, front->start, front->transferred, front->len); - /* Stall if there may be a discontinuity. */ - rstart = round_down(front->start, PAGE_SIZE); - if (rstart > wreq->contiguity) { - if (wreq->contiguity > stream->collected_to) { - trace_netfs_collect_gap(wreq, stream, - wreq->contiguity, 'D'); - stream->collected_to = wreq->contiguity; - } - notes |= REASSESS_DISCONTIG; - break; + if (stream->collected_to < front->start) { + trace_netfs_collect_gap(wreq, stream, issued_to, 'F'); + stream->collected_to = front->start; } - rend = round_up(front->start + front->len, PAGE_SIZE); - if (rend > wreq->contiguity) { - trace_netfs_collect_contig(wreq, rend, - netfs_contig_trace_collect); - wreq->contiguity = rend; - if (notes & REASSESS_DISCONTIG) - notes |= NEED_REASSESS; - } - notes &= ~MAYBE_DISCONTIG; /* Stall if the front is still undergoing I/O. */ if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) { @@ -473,33 +450,27 @@ reassess_streams: cancel: /* Remove if completely consumed. */ - spin_lock(&wreq->lock); + spin_lock_bh(&wreq->lock); remove = front; list_del_init(&front->rreq_link); front = list_first_entry_or_null(&stream->subrequests, struct netfs_io_subrequest, rreq_link); stream->front = front; - if (!front) { - unsigned long long jump_to = atomic64_read(&wreq->issued_to); - - if (stream->collected_to < jump_to) { - trace_netfs_collect_gap(wreq, stream, jump_to, 'A'); - stream->collected_to = jump_to; - } - } - - spin_unlock(&wreq->lock); + spin_unlock_bh(&wreq->lock); netfs_put_subrequest(remove, false, notes & SAW_FAILURE ? netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_done); } - if (front) - notes &= ~ALL_EMPTY; - else - notes |= SOME_EMPTY; + /* If we have an empty stream, we need to jump it forward + * otherwise the collection point will never advance. + */ + if (!front && issued_to > stream->collected_to) { + trace_netfs_collect_gap(wreq, stream, issued_to, 'E'); + stream->collected_to = issued_to; + } if (stream->collected_to < collected_to) collected_to = stream->collected_to; @@ -508,36 +479,6 @@ reassess_streams: if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to) wreq->collected_to = collected_to; - /* If we have an empty stream, we need to jump it forward over any gap - * otherwise the collection point will never advance. - * - * Note that the issuer always adds to the stream with the lowest - * so-far submitted start, so if we see two consecutive subreqs in one - * stream with nothing between then in another stream, then the second - * stream has a gap that can be jumped. - */ - if (notes & SOME_EMPTY) { - unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && - stream->front && - stream->front->start < jump_to) - jump_to = stream->front->start; - } - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && - !stream->front && - stream->collected_to < jump_to) { - trace_netfs_collect_gap(wreq, stream, jump_to, 'B'); - stream->collected_to = jump_to; - } - } - } - for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; if (stream->active) @@ -548,43 +489,14 @@ reassess_streams: /* Unlock any folios that we have now finished with. */ if (notes & BUFFERED) { - unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity); - - if (wreq->cleaned_to < clean_to) - netfs_writeback_unlock_folios(wreq, clean_to, ¬es); + if (wreq->cleaned_to < wreq->collected_to) + netfs_writeback_unlock_folios(wreq, ¬es); } else { wreq->cleaned_to = wreq->collected_to; } // TODO: Discard encryption buffers - /* If all streams are discontiguous with the last folio we cleared, we - * may need to skip a set of folios. - */ - if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) { - unsigned long long jump_to = ULLONG_MAX; - - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->active && stream->front && - stream->front->start < jump_to) - jump_to = stream->front->start; - } - - trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump); - wreq->contiguity = jump_to; - wreq->cleaned_to = jump_to; - wreq->collected_to = jump_to; - for (s = 0; s < NR_IO_STREAMS; s++) { - stream = &wreq->io_streams[s]; - if (stream->collected_to < jump_to) - stream->collected_to = jump_to; - } - //cond_resched(); - notes |= MADE_PROGRESS; - goto reassess_streams; - } - if (notes & NEED_RETRY) goto need_retry; if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 3f7e37e50c7d..04e66d587f77 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -95,7 +95,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, struct netfs_io_request *wreq; struct netfs_inode *ictx; bool is_buffered = (origin == NETFS_WRITEBACK || - origin == NETFS_WRITETHROUGH); + origin == NETFS_WRITETHROUGH || + origin == NETFS_PGPRIV2_COPY_TO_CACHE); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -107,9 +108,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); - wreq->contiguity = wreq->start; wreq->cleaned_to = wreq->start; - INIT_WORK(&wreq->work, netfs_write_collection_worker); wreq->io_streams[0].stream_nr = 0; wreq->io_streams[0].source = NETFS_UPLOAD_TO_SERVER; @@ -158,22 +157,19 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, subreq = netfs_alloc_subrequest(wreq); subreq->source = stream->source; subreq->start = start; - subreq->max_len = ULONG_MAX; - subreq->max_nr_segs = INT_MAX; subreq->stream_nr = stream->stream_nr; + subreq->io_iter = wreq->io_iter; _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); - trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, - refcount_read(&subreq->ref), - netfs_sreq_trace_new); - trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); + stream->sreq_max_len = UINT_MAX; + stream->sreq_max_segs = INT_MAX; switch (stream->source) { case NETFS_UPLOAD_TO_SERVER: netfs_stat(&netfs_n_wh_upload); - subreq->max_len = wreq->wsize; + stream->sreq_max_len = wreq->wsize; break; case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write); @@ -192,7 +188,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, * the list. The collector only goes nextwards and uses the lock to * remove entries off of the front. */ - spin_lock(&wreq->lock); + spin_lock_bh(&wreq->lock); list_add_tail(&subreq->rreq_link, &stream->subrequests); if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { stream->front = subreq; @@ -203,7 +199,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, } } - spin_unlock(&wreq->lock); + spin_unlock_bh(&wreq->lock); stream->construct = subreq; } @@ -223,41 +219,34 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error, false); - // TODO: Use encrypted buffer - if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) { - subreq->io_iter = wreq->io_iter; - iov_iter_advance(&subreq->io_iter, - subreq->start + subreq->transferred - wreq->start); - iov_iter_truncate(&subreq->io_iter, - subreq->len - subreq->transferred); - } else { - iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages, - subreq->start + subreq->transferred, - subreq->len - subreq->transferred); - } - trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); } void netfs_reissue_write(struct netfs_io_stream *stream, - struct netfs_io_subrequest *subreq) + struct netfs_io_subrequest *subreq, + struct iov_iter *source) { + size_t size = subreq->len - subreq->transferred; + + // TODO: Use encrypted buffer + subreq->io_iter = *source; + iov_iter_advance(source, size); + iov_iter_truncate(&subreq->io_iter, size); + __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } -static void netfs_issue_write(struct netfs_io_request *wreq, - struct netfs_io_stream *stream) +void netfs_issue_write(struct netfs_io_request *wreq, + struct netfs_io_stream *stream) { struct netfs_io_subrequest *subreq = stream->construct; if (!subreq) return; stream->construct = NULL; - - if (subreq->start + subreq->len > wreq->start + wreq->submitted) - WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); + subreq->io_iter.count = subreq->len; netfs_do_issue_write(stream, subreq); } @@ -290,13 +279,14 @@ int netfs_advance_write(struct netfs_io_request *wreq, netfs_prepare_write(wreq, stream, start); subreq = stream->construct; - part = min(subreq->max_len - subreq->len, len); - _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + part = umin(stream->sreq_max_len - subreq->len, len); + _debug("part %zx/%zx %zx/%zx", subreq->len, stream->sreq_max_len, part, len); subreq->len += part; subreq->nr_segs++; + stream->submit_extendable_to -= part; - if (subreq->len >= subreq->max_len || - subreq->nr_segs >= subreq->max_nr_segs || + if (subreq->len >= stream->sreq_max_len || + subreq->nr_segs >= stream->sreq_max_segs || to_eof) { netfs_issue_write(wreq, stream); subreq = NULL; @@ -410,19 +400,26 @@ static int netfs_write_folio(struct netfs_io_request *wreq, folio_unlock(folio); if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) { - if (!fscache_resources_valid(&wreq->cache_resources)) { + if (!cache->avail) { trace_netfs_folio(folio, netfs_folio_trace_cancel_copy); netfs_issue_write(wreq, upload); netfs_folio_written_back(folio); return 0; } trace_netfs_folio(folio, netfs_folio_trace_store_copy); + } else if (!upload->avail && !cache->avail) { + trace_netfs_folio(folio, netfs_folio_trace_cancel_store); + netfs_folio_written_back(folio); + return 0; } else if (!upload->construct) { trace_netfs_folio(folio, netfs_folio_trace_store); } else { trace_netfs_folio(folio, netfs_folio_trace_store_plus); } + /* Attach the folio to the rolling buffer. */ + netfs_buffer_append_folio(wreq, folio, false); + /* Move the submission point forward to allow for write-streaming data * not starting at the front of the page. We don't do write-streaming * with the cache as the cache requires DIO alignment. @@ -432,7 +429,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq, */ for (int s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; - stream->submit_max_len = fsize; stream->submit_off = foff; stream->submit_len = flen; if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || @@ -440,7 +436,6 @@ static int netfs_write_folio(struct netfs_io_request *wreq, fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { stream->submit_off = UINT_MAX; stream->submit_len = 0; - stream->submit_max_len = 0; } } @@ -467,12 +462,13 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (choose_s < 0) break; stream = &wreq->io_streams[choose_s]; + wreq->io_iter.iov_offset = stream->submit_off; + atomic64_set(&wreq->issued_to, fpos + stream->submit_off); + stream->submit_extendable_to = fsize - stream->submit_off; part = netfs_advance_write(wreq, stream, fpos + stream->submit_off, stream->submit_len, to_eof); - atomic64_set(&wreq->issued_to, fpos + stream->submit_off); stream->submit_off += part; - stream->submit_max_len -= part; if (part > stream->submit_len) stream->submit_len = 0; else @@ -481,6 +477,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq, debug = true; } + wreq->io_iter.iov_offset = 0; + iov_iter_advance(&wreq->io_iter, fsize); atomic64_set(&wreq->issued_to, fpos + fsize); if (!debug) @@ -505,10 +503,14 @@ int netfs_writepages(struct address_space *mapping, struct folio *folio; int error = 0; - if (wbc->sync_mode == WB_SYNC_ALL) + if (!mutex_trylock(&ictx->wb_lock)) { + if (wbc->sync_mode == WB_SYNC_NONE) { + netfs_stat(&netfs_n_wb_lock_skip); + return 0; + } + netfs_stat(&netfs_n_wb_lock_wait); mutex_lock(&ictx->wb_lock); - else if (!mutex_trylock(&ictx->wb_lock)) - return 0; + } /* Need the first folio to be able to set up the op. */ folio = writeback_iter(mapping, wbc, NULL, &error); @@ -525,10 +527,10 @@ int netfs_writepages(struct address_space *mapping, netfs_stat(&netfs_n_wh_writepages); do { - _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + _debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to)); /* It appears we don't have to handle cyclic writeback wrapping. */ - WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); + WARN_ON_ONCE(wreq && folio_pos(folio) < atomic64_read(&wreq->issued_to)); if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE && unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) { @@ -672,6 +674,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; + iov_iter_advance(&wreq->io_iter, part); if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7a558dea75c4..810269ee0a50 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -267,6 +267,7 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + rreq->io_streams[0].sreq_max_len = NFS_SB(rreq->inode->i_sb)->rsize; return 0; } @@ -288,14 +289,6 @@ static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sre return netfs; } -static bool nfs_netfs_clamp_length(struct netfs_io_subrequest *sreq) -{ - size_t rsize = NFS_SB(sreq->rreq->inode->i_sb)->rsize; - - sreq->len = min(sreq->len, rsize); - return true; -} - static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) { struct nfs_netfs_io_data *netfs; @@ -304,17 +297,18 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq) struct nfs_open_context *ctx = sreq->rreq->netfs_priv; struct page *page; unsigned long idx; + pgoff_t start, last; int err; - pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; - pgoff_t last = ((sreq->start + sreq->len - - sreq->transferred - 1) >> PAGE_SHIFT); + + start = (sreq->start + sreq->transferred) >> PAGE_SHIFT; + last = ((sreq->start + sreq->len - sreq->transferred - 1) >> PAGE_SHIFT); nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); netfs = nfs_netfs_alloc(sreq); if (!netfs) - return netfs_subreq_terminated(sreq, -ENOMEM, false); + return netfs_read_subreq_terminated(sreq, -ENOMEM, false); pgio.pg_netfs = netfs; /* used in completion */ @@ -380,5 +374,4 @@ const struct netfs_request_ops nfs_netfs_ops = { .init_request = nfs_netfs_init_request, .free_request = nfs_netfs_free_request, .issue_read = nfs_netfs_issue_read, - .clamp_length = nfs_netfs_clamp_length }; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index e8adae1bc260..772d485e96d3 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -60,8 +60,6 @@ static inline void nfs_netfs_get(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) { - ssize_t final_len; - /* Only the last RPC completion should call netfs_subreq_terminated() */ if (!refcount_dec_and_test(&netfs->refcount)) return; @@ -74,8 +72,9 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) * Correct the final length here to be no larger than the netfs subrequest * length, and thus avoid netfs's "Subreq overread" warning message. */ - final_len = min_t(s64, netfs->sreq->len, atomic64_read(&netfs->transferred)); - netfs_subreq_terminated(netfs->sreq, netfs->error ?: final_len, false); + netfs->sreq->transferred = min_t(s64, netfs->sreq->len, + atomic64_read(&netfs->transferred)); + netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); kfree(netfs); } static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index b0473c2567fe..7481b21a0489 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -21,127 +21,21 @@ #include #include #include +#include #include "../common/arc4.h" #include -/* - * Hash data from a BVEC-type iterator. - */ -static int cifs_shash_bvec(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) +static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) { - const struct bio_vec *bv = iter->bvec; - unsigned long start = iter->iov_offset; - unsigned int i; - void *p; - int ret; + struct shash_desc *shash = priv; + int ret, *pret = priv2; - for (i = 0; i < iter->nr_segs; i++) { - size_t off, len; - - len = bv[i].bv_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - off = bv[i].bv_offset + start; - - p = kmap_local_page(bv[i].bv_page); - ret = crypto_shash_update(shash, p + off, len); - kunmap_local(p); - if (ret < 0) - return ret; - - maxsize -= len; - if (maxsize <= 0) - break; - start = 0; + ret = crypto_shash_update(shash, iter_base, len); + if (ret < 0) { + *pret = ret; + return len; } - - return 0; -} - -/* - * Hash data from a KVEC-type iterator. - */ -static int cifs_shash_kvec(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) -{ - const struct kvec *kv = iter->kvec; - unsigned long start = iter->iov_offset; - unsigned int i; - int ret; - - for (i = 0; i < iter->nr_segs; i++) { - size_t len; - - len = kv[i].iov_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - ret = crypto_shash_update(shash, kv[i].iov_base + start, len); - if (ret < 0) - return ret; - maxsize -= len; - - if (maxsize <= 0) - break; - start = 0; - } - - return 0; -} - -/* - * Hash data from an XARRAY-type iterator. - */ -static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, - struct shash_desc *shash) -{ - struct folio *folios[16], *folio; - unsigned int nr, i, j, npages; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t last, index = start / PAGE_SIZE; - ssize_t ret = 0; - size_t len, offset, foffset; - void *p; - - if (maxsize == 0) - return 0; - - last = (start + maxsize - 1) / PAGE_SIZE; - do { - nr = xa_extract(iter->xarray, (void **)folios, index, last, - ARRAY_SIZE(folios), XA_PRESENT); - if (nr == 0) - return -EIO; - - for (i = 0; i < nr; i++) { - folio = folios[i]; - npages = folio_nr_pages(folio); - foffset = start - folio_pos(folio); - offset = foffset % PAGE_SIZE; - for (j = foffset / PAGE_SIZE; j < npages; j++) { - len = min_t(size_t, maxsize, PAGE_SIZE - offset); - p = kmap_local_page(folio_page(folio, j)); - ret = crypto_shash_update(shash, p + offset, len); - kunmap_local(p); - if (ret < 0) - return ret; - maxsize -= len; - if (maxsize <= 0) - return 0; - start += len; - offset = 0; - index++; - } - } - } while (nr == ARRAY_SIZE(folios)); return 0; } @@ -151,21 +45,13 @@ static ssize_t cifs_shash_xarray(const struct iov_iter *iter, ssize_t maxsize, static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, struct shash_desc *shash) { - if (maxsize == 0) - return 0; + struct iov_iter tmp_iter = *iter; + int err = -EIO; - switch (iov_iter_type(iter)) { - case ITER_BVEC: - return cifs_shash_bvec(iter, maxsize, shash); - case ITER_KVEC: - return cifs_shash_kvec(iter, maxsize, shash); - case ITER_XARRAY: - return cifs_shash_xarray(iter, maxsize, shash); - default: - pr_err("cifs_shash_iter(%u) unsupported\n", iov_iter_type(iter)); - WARN_ON_ONCE(1); - return -EIO; - } + if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err, + cifs_shash_step) != maxsize) + return err; + return 0; } int __cifs_calc_signature(struct smb_rqst *rqst, diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 9eae8649f90c..2709c7b0be85 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -255,7 +255,7 @@ struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ struct iov_iter rq_iter; /* Data iterator */ - struct xarray rq_buffer; /* Page buffer for encryption */ + struct folio_queue *rq_buffer; /* Buffer for encryption */ }; struct mid_q_entry; @@ -1485,7 +1485,6 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; - size_t actual_len; unsigned int xid; int result; bool have_xid; @@ -1550,7 +1549,6 @@ struct cifsInodeInfo { #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ -#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ #define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */ unsigned long flags; spinlock_t writers_lock; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index cfae2e918209..d81da161d3ed 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1266,9 +1266,7 @@ static void cifs_readv_worker(struct work_struct *work) struct cifs_io_subrequest *rdata = container_of(work, struct cifs_io_subrequest, subreq.work); - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, true); + netfs_read_subreq_terminated(&rdata->subreq, rdata->result, true); } static void @@ -1327,15 +1325,16 @@ cifs_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } else { - if (rdata->got_bytes < rdata->actual_len && - rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == - ictx->remote_i_size) { + size_t trans = rdata->subreq.transferred + rdata->got_bytes; + if (trans < rdata->subreq.len && + rdata->subreq.start + trans == ictx->remote_i_size) { __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } } rdata->credits.value = 0; + rdata->subreq.transferred += rdata->got_bytes; INIT_WORK(&rdata->subreq.work, cifs_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 2d387485f05b..bcde3f9c89e0 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -49,6 +49,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) struct cifs_io_subrequest *wdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = wdata->req; + struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; struct TCP_Server_Info *server; struct cifsFileInfo *open_file = req->cfile; size_t wsize = req->rreq.wsize; @@ -73,7 +74,7 @@ retry: } } - rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len, + rc = server->ops->wait_mtu_credits(server, wsize, &stream->sreq_max_len, &wdata->credits); if (rc < 0) { subreq->error = rc; @@ -92,7 +93,7 @@ retry: #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) - subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + stream->sreq_max_segs = server->smbd_conn->max_frmr_depth; #endif } @@ -111,7 +112,6 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) goto fail; } - wdata->actual_len = wdata->subreq.len; rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust); if (rc) goto fail; @@ -140,25 +140,22 @@ static void cifs_netfs_invalidate_cache(struct netfs_io_request *wreq) } /* - * Split the read up according to how many credits we can get for each piece. - * It's okay to sleep here if we need to wait for more credit to become - * available. - * - * We also choose the server and allocate an operation ID to be cleaned up - * later. + * Negotiate the size of a read operation on behalf of the netfs library. */ -static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) +static int cifs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - size_t rsize; - int rc; + size_t size; + int rc = 0; - rdata->xid = get_xid(); - rdata->have_xid = true; + if (!rdata->have_xid) { + rdata->xid = get_xid(); + rdata->have_xid = true; + } rdata->server = server; if (cifs_sb->ctx->rsize == 0) @@ -166,13 +163,12 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), cifs_sb->ctx); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, - &rsize, &rdata->credits); - if (rc) { - subreq->error = rc; - return false; - } + &size, &rdata->credits); + if (rc) + return rc; + + rreq->io_streams[0].sreq_max_len = size; rdata->credits.in_flight_check = 1; rdata->credits.rreq_debug_id = rreq->debug_id; @@ -184,14 +180,11 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_submit); - subreq->len = umin(subreq->len, rsize); - rdata->actual_len = subreq->len; - #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) - subreq->max_nr_segs = server->smbd_conn->max_frmr_depth; + rreq->io_streams[0].sreq_max_segs = server->smbd_conn->max_frmr_depth; #endif - return true; + return 0; } /* @@ -200,59 +193,41 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) * to only read a portion of that, but as long as we read something, the netfs * helper will call us again so that we can issue another read. */ -static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) +static void cifs_issue_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; - struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); - if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { - /* - * As we're issuing a retry, we need to negotiate some new - * credits otherwise the server may reject the op with - * INVALID_PARAMETER. Note, however, we may get back less - * credit than we need to complete the op, in which case, we - * shorten the op and rely on additional rounds of retry. - */ - size_t rsize = umin(subreq->len - subreq->transferred, - cifs_sb->ctx->rsize); - - rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len, - &rdata->credits); - if (rc) - goto out; - - rdata->credits.in_flight_check = 1; - - trace_smb3_rw_credits(rdata->rreq->debug_id, - rdata->subreq.debug_index, - rdata->credits.value, - server->credits, server->in_flight, 0, - cifs_trace_rw_credits_read_resubmit); - } + rc = adjust_credits(server, rdata, cifs_trace_rw_credits_issue_read_adjust); + if (rc) + goto failed; if (req->cfile->invalidHandle) { do { rc = cifs_reopen_file(req->cfile, true); } while (rc == -EAGAIN); if (rc) - goto out; + goto failed; } if (subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); rc = rdata->server->ops->async_readv(rdata); -out: if (rc) - netfs_subreq_terminated(subreq, rc, false); + goto failed; + return; + +failed: + netfs_read_subreq_terminated(subreq, rc, false); } /* @@ -316,12 +291,6 @@ static void cifs_rreq_done(struct netfs_io_request *rreq) inode_set_atime_to_ts(inode, inode_get_mtime(inode)); } -static void cifs_post_modify(struct inode *inode) -{ - /* Indication to update ctime and mtime as close is deferred */ - set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); -} - static void cifs_free_request(struct netfs_io_request *rreq) { struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq); @@ -369,10 +338,9 @@ const struct netfs_request_ops cifs_req_ops = { .init_request = cifs_init_request, .free_request = cifs_free_request, .free_subrequest = cifs_free_subrequest, - .clamp_length = cifs_clamp_length, - .issue_read = cifs_req_issue_read, + .prepare_read = cifs_prepare_read, + .issue_read = cifs_issue_read, .done = cifs_rreq_done, - .post_modify = cifs_post_modify, .begin_writeback = cifs_begin_writeback, .prepare_write = cifs_prepare_write, .issue_write = cifs_issue_write, @@ -1396,7 +1364,7 @@ int cifs_close(struct inode *inode, struct file *file) dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); if ((cfile->status_file_deleted == false) && (smb2_can_defer_close(inode, dclose))) { - if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { + if (test_and_clear_bit(NETFS_ICTX_MODIFIED_ATTR, &cinode->netfs.flags)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index e6540072ffb0..159a063de6dd 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "cifsfs.h" #include "cifsglob.h" @@ -301,7 +302,8 @@ smb2_adjust_credits(struct TCP_Server_Info *server, unsigned int /*enum smb3_rw_credits_trace*/ trace) { struct cifs_credits *credits = &subreq->credits; - int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE); + int new_val = DIV_ROUND_UP(subreq->subreq.len - subreq->subreq.transferred, + SMB2_MAX_BUFFER_SIZE); int scredits, in_flight; if (!credits->value || credits->value == new_val) @@ -4392,30 +4394,86 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, } /* - * Clear a read buffer, discarding the folios which have XA_MARK_0 set. + * Clear a read buffer, discarding the folios which have the 1st mark set. */ -static void cifs_clear_xarray_buffer(struct xarray *buffer) +static void cifs_clear_folioq_buffer(struct folio_queue *buffer) { - struct folio *folio; + struct folio_queue *folioq; - XA_STATE(xas, buffer, 0); - - rcu_read_lock(); - xas_for_each_marked(&xas, folio, ULONG_MAX, XA_MARK_0) { - folio_put(folio); + while ((folioq = buffer)) { + for (int s = 0; s < folioq_count(folioq); s++) + if (folioq_is_marked(folioq, s)) + folio_put(folioq_folio(folioq, s)); + buffer = folioq->next; + kfree(folioq); } - rcu_read_unlock(); - xa_destroy(buffer); +} + +/* + * Allocate buffer space into a folio queue. + */ +static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size) +{ + struct folio_queue *buffer = NULL, *tail = NULL, *p; + struct folio *folio; + unsigned int slot; + + do { + if (!tail || folioq_full(tail)) { + p = kmalloc(sizeof(*p), GFP_NOFS); + if (!p) + goto nomem; + folioq_init(p); + if (tail) { + tail->next = p; + p->prev = tail; + } else { + buffer = p; + } + tail = p; + } + + folio = folio_alloc(GFP_KERNEL|__GFP_HIGHMEM, 0); + if (!folio) + goto nomem; + + slot = folioq_append_mark(tail, folio); + size -= folioq_folio_size(tail, slot); + } while (size > 0); + + return buffer; + +nomem: + cifs_clear_folioq_buffer(buffer); + return NULL; +} + +/* + * Copy data from an iterator to the folios in a folio queue buffer. + */ +static bool cifs_copy_iter_to_folioq(struct iov_iter *iter, size_t size, + struct folio_queue *buffer) +{ + for (; buffer; buffer = buffer->next) { + for (int s = 0; s < folioq_count(buffer); s++) { + struct folio *folio = folioq_folio(buffer, s); + size_t part = folioq_folio_size(buffer, s); + + part = umin(part, size); + + if (copy_folio_from_iter(folio, 0, part, iter) != part) + return false; + size -= part; + } + } + return true; } void smb3_free_compound_rqst(int num_rqst, struct smb_rqst *rqst) { - int i; - - for (i = 0; i < num_rqst; i++) - if (!xa_empty(&rqst[i].rq_buffer)) - cifs_clear_xarray_buffer(&rqst[i].rq_buffer); + for (int i = 0; i < num_rqst; i++) + cifs_clear_folioq_buffer(rqst[i].rq_buffer); } /* @@ -4436,52 +4494,32 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *new_rq, struct smb_rqst *old_rq) { struct smb2_transform_hdr *tr_hdr = new_rq[0].rq_iov[0].iov_base; - struct page *page; unsigned int orig_len = 0; - int i, j; int rc = -ENOMEM; - for (i = 1; i < num_rqst; i++) { + for (int i = 1; i < num_rqst; i++) { struct smb_rqst *old = &old_rq[i - 1]; struct smb_rqst *new = &new_rq[i]; - struct xarray *buffer = &new->rq_buffer; - size_t size = iov_iter_count(&old->rq_iter), seg, copied = 0; + struct folio_queue *buffer; + size_t size = iov_iter_count(&old->rq_iter); orig_len += smb_rqst_len(server, old); new->rq_iov = old->rq_iov; new->rq_nvec = old->rq_nvec; - xa_init(buffer); - if (size > 0) { - unsigned int npages = DIV_ROUND_UP(size, PAGE_SIZE); + buffer = cifs_alloc_folioq_buffer(size); + if (!buffer) + goto err_free; - for (j = 0; j < npages; j++) { - void *o; + new->rq_buffer = buffer; + iov_iter_folio_queue(&new->rq_iter, ITER_SOURCE, + buffer, 0, 0, size); - rc = -ENOMEM; - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) - goto err_free; - page->index = j; - o = xa_store(buffer, j, page, GFP_KERNEL); - if (xa_is_err(o)) { - rc = xa_err(o); - put_page(page); - goto err_free; - } - - xa_set_mark(buffer, j, XA_MARK_0); - - seg = min_t(size_t, size - copied, PAGE_SIZE); - if (copy_page_from_iter(page, 0, seg, &old->rq_iter) != seg) { - rc = -EFAULT; - goto err_free; - } - copied += seg; + if (!cifs_copy_iter_to_folioq(&old->rq_iter, size, buffer)) { + rc = -EIO; + goto err_free; } - iov_iter_xarray(&new->rq_iter, ITER_SOURCE, - buffer, 0, size); } } @@ -4545,22 +4583,23 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, } static int -cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, - unsigned int skip, struct iov_iter *iter) +cifs_copy_folioq_to_iter(struct folio_queue *folioq, size_t data_size, + size_t skip, struct iov_iter *iter) { - struct page *page; - unsigned long index; + for (; folioq; folioq = folioq->next) { + for (int s = 0; s < folioq_count(folioq); s++) { + struct folio *folio = folioq_folio(folioq, s); + size_t fsize = folio_size(folio); + size_t n, len = umin(fsize - skip, data_size); - xa_for_each(pages, index, page) { - size_t n, len = min_t(unsigned int, PAGE_SIZE - skip, data_size); - - n = copy_page_to_iter(page, skip, len, iter); - if (n != len) { - cifs_dbg(VFS, "%s: something went wrong\n", __func__); - return -EIO; + n = copy_folio_to_iter(folio, skip, len, iter); + if (n != len) { + cifs_dbg(VFS, "%s: something went wrong\n", __func__); + return -EIO; + } + data_size -= n; + skip = 0; } - data_size -= n; - skip = 0; } return 0; @@ -4568,8 +4607,8 @@ cifs_copy_pages_to_iter(struct xarray *pages, unsigned int data_size, static int handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, - char *buf, unsigned int buf_len, struct xarray *pages, - unsigned int pages_len, bool is_offloaded) + char *buf, unsigned int buf_len, struct folio_queue *buffer, + unsigned int buffer_len, bool is_offloaded) { unsigned int data_offset; unsigned int data_len; @@ -4666,7 +4705,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - if (data_len > pages_len - pad_len) { + if (data_len > buffer_len - pad_len) { /* data_len is corrupt -- discard frame */ rdata->result = -EIO; if (is_offloaded) @@ -4677,8 +4716,8 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } /* Copy the data to the output I/O iterator. */ - rdata->result = cifs_copy_pages_to_iter(pages, pages_len, - cur_off, &rdata->subreq.io_iter); + rdata->result = cifs_copy_folioq_to_iter(buffer, buffer_len, + cur_off, &rdata->subreq.io_iter); if (rdata->result != 0) { if (is_offloaded) mid->mid_state = MID_RESPONSE_MALFORMED; @@ -4686,12 +4725,11 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, dequeue_mid(mid, rdata->result); return 0; } - rdata->got_bytes = pages_len; + rdata->got_bytes = buffer_len; } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ - WARN_ONCE(pages && !xa_empty(pages), - "read data can be either in buf or in pages"); + WARN_ONCE(buffer, "read data can be either in buf or in buffer"); length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); if (length < 0) return length; @@ -4717,7 +4755,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, struct smb2_decrypt_work { struct work_struct decrypt; struct TCP_Server_Info *server; - struct xarray buffer; + struct folio_queue *buffer; char *buf; unsigned int len; }; @@ -4731,7 +4769,7 @@ static void smb2_decrypt_offload(struct work_struct *work) struct mid_q_entry *mid; struct iov_iter iter; - iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, dw->len); + iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, dw->len); rc = decrypt_raw_data(dw->server, dw->buf, dw->server->vals->read_rsp_size, &iter, true); if (rc) { @@ -4747,7 +4785,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->decrypted = true; rc = handle_read_data(dw->server, mid, dw->buf, dw->server->vals->read_rsp_size, - &dw->buffer, dw->len, + dw->buffer, dw->len, true); if (rc >= 0) { #ifdef CONFIG_CIFS_STATS2 @@ -4780,7 +4818,7 @@ static void smb2_decrypt_offload(struct work_struct *work) } free_pages: - cifs_clear_xarray_buffer(&dw->buffer); + cifs_clear_folioq_buffer(dw->buffer); cifs_small_buf_release(dw->buf); kfree(dw); } @@ -4790,20 +4828,17 @@ static int receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, int *num_mids) { - struct page *page; char *buf = server->smallbuf; struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; struct iov_iter iter; - unsigned int len, npages; + unsigned int len; unsigned int buflen = server->pdu_size; int rc; - int i = 0; struct smb2_decrypt_work *dw; dw = kzalloc(sizeof(struct smb2_decrypt_work), GFP_KERNEL); if (!dw) return -ENOMEM; - xa_init(&dw->buffer); INIT_WORK(&dw->decrypt, smb2_decrypt_offload); dw->server = server; @@ -4819,26 +4854,14 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, len = le32_to_cpu(tr_hdr->OriginalMessageSize) - server->vals->read_rsp_size; dw->len = len; - npages = DIV_ROUND_UP(len, PAGE_SIZE); + len = round_up(dw->len, PAGE_SIZE); rc = -ENOMEM; - for (; i < npages; i++) { - void *old; + dw->buffer = cifs_alloc_folioq_buffer(len); + if (!dw->buffer) + goto discard_data; - page = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); - if (!page) - goto discard_data; - page->index = i; - old = xa_store(&dw->buffer, i, page, GFP_KERNEL); - if (xa_is_err(old)) { - rc = xa_err(old); - put_page(page); - goto discard_data; - } - xa_set_mark(&dw->buffer, i, XA_MARK_0); - } - - iov_iter_xarray(&iter, ITER_DEST, &dw->buffer, 0, npages * PAGE_SIZE); + iov_iter_folio_queue(&iter, ITER_DEST, dw->buffer, 0, 0, len); /* Read the data into the buffer and clear excess bufferage. */ rc = cifs_read_iter_from_socket(server, &iter, dw->len); @@ -4846,9 +4869,9 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, goto discard_data; server->total_read += rc; - if (rc < npages * PAGE_SIZE) - iov_iter_zero(npages * PAGE_SIZE - rc, &iter); - iov_iter_revert(&iter, npages * PAGE_SIZE); + if (rc < len) + iov_iter_zero(len - rc, &iter); + iov_iter_revert(&iter, len); iov_iter_truncate(&iter, dw->len); rc = cifs_discard_remaining_data(server); @@ -4883,7 +4906,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, (*mid)->decrypted = true; rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, - &dw->buffer, dw->len, false); + dw->buffer, dw->len, false); if (rc >= 0) { if (server->ops->is_network_name_deleted) { server->ops->is_network_name_deleted(buf, @@ -4893,7 +4916,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, } free_pages: - cifs_clear_xarray_buffer(&dw->buffer); + cifs_clear_folioq_buffer(dw->buffer); free_dw: kfree(dw); return rc; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 88dc49d67037..95377bb91950 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4498,9 +4498,7 @@ static void smb2_readv_worker(struct work_struct *work) struct cifs_io_subrequest *rdata = container_of(work, struct cifs_io_subrequest, subreq.work); - netfs_subreq_terminated(&rdata->subreq, - (rdata->result == 0 || rdata->result == -EAGAIN) ? - rdata->got_bytes : rdata->result, true); + netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false); } static void @@ -4532,7 +4530,7 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, - rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred); + rdata->got_bytes, rdata->subreq.len - rdata->subreq.transferred); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -4554,6 +4552,7 @@ smb2_readv_callback(struct mid_q_entry *mid) break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: + __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags); rdata->result = -EAGAIN; if (server->sign && rdata->got_bytes) /* reset bytes number since we can not check a sign */ @@ -4588,7 +4587,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->req->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, rdata->subreq.start + rdata->subreq.transferred, - rdata->actual_len, + rdata->subreq.len - rdata->subreq.transferred, rdata->result); } else trace_smb3_read_done(rdata->rreq->debug_id, @@ -4603,9 +4602,9 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } else { - if (rdata->got_bytes < rdata->actual_len && - rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == - ictx->remote_i_size) { + size_t trans = rdata->subreq.transferred + rdata->got_bytes; + if (trans < rdata->subreq.len && + rdata->subreq.start + trans == ictx->remote_i_size) { __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } @@ -4614,6 +4613,10 @@ smb2_readv_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_response_clear); rdata->credits.value = 0; + rdata->subreq.transferred += rdata->got_bytes; + if (rdata->subreq.start + rdata->subreq.transferred >= rdata->subreq.rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); INIT_WORK(&rdata->subreq.work, smb2_readv_worker); queue_work(cifsiod_wq, &rdata->subreq.work); release_mid(mid); @@ -4648,7 +4651,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); io_parms.server = server = rdata->server; io_parms.offset = subreq->start + subreq->transferred; - io_parms.length = rdata->actual_len; + io_parms.length = subreq->len - subreq->transferred; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; io_parms.pid = rdata->req->pid; @@ -4669,7 +4672,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { - shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(io_parms.length, SMB2_MAX_BUFFER_SIZE)); credit_request = le16_to_cpu(shdr->CreditCharge) + 8; if (server->credits >= server->max_credits) @@ -4697,7 +4700,8 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) rdata->xid, io_parms.persistent_fid, io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, rdata->actual_len, rc); + io_parms.offset, + subreq->len - subreq->transferred, rc); } async_readv_out: @@ -4880,6 +4884,7 @@ smb2_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; + trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); cifs_write_subrequest_terminated(wdata, result ?: written, true); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 7bcc379014ca..80262a36030f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -6,6 +6,7 @@ */ #include #include +#include #include "smbdirect.h" #include "cifs_debug.h" #include "cifsproto.h" @@ -2463,6 +2464,8 @@ static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, start = 0; } + if (ret > 0) + iov_iter_advance(iter, ret); return ret; } @@ -2519,50 +2522,65 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, start = 0; } + if (ret > 0) + iov_iter_advance(iter, ret); return ret; } /* - * Extract folio fragments from an XARRAY-class iterator and add them to an - * RDMA list. The folios are not pinned. + * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA + * list. The folios are not pinned. */ -static ssize_t smb_extract_xarray_to_rdma(struct iov_iter *iter, +static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, struct smb_extract_to_rdma *rdma, ssize_t maxsize) { - struct xarray *xa = iter->xarray; - struct folio *folio; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t index = start / PAGE_SIZE; + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; ssize_t ret = 0; - size_t off, len; - XA_STATE(xas, xa, index); + size_t offset = iter->iov_offset; - rcu_read_lock(); + BUG_ON(!folioq); - xas_for_each(&xas, folio, ULONG_MAX) { - if (xas_retry(&xas, folio)) - continue; - if (WARN_ON(xa_is_value(folio))) - break; - if (WARN_ON(folio_test_hugetlb(folio))) - break; - - off = offset_in_folio(folio, start); - len = min_t(size_t, maxsize, folio_size(folio) - off); - - if (!smb_set_sge(rdma, folio_page(folio, 0), off, len)) { - rcu_read_unlock(); + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + if (WARN_ON_ONCE(!folioq)) return -EIO; - } - - maxsize -= len; - ret += len; - if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) - break; + slot = 0; } - rcu_read_unlock(); + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t fsize = folioq_folio_size(folioq, slot); + + if (offset < fsize) { + size_t part = umin(maxsize - ret, fsize - offset); + + if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) + return -EIO; + + offset += part; + ret += part; + } + + if (offset >= fsize) { + offset = 0; + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (!folioq->next) { + WARN_ON_ONCE(ret < iter->count); + break; + } + folioq = folioq->next; + slot = 0; + } + } + } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); + + iter->folioq = folioq; + iter->folioq_slot = slot; + iter->iov_offset = offset; + iter->count -= ret; return ret; } @@ -2590,17 +2608,15 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, case ITER_KVEC: ret = smb_extract_kvec_to_rdma(iter, rdma, len); break; - case ITER_XARRAY: - ret = smb_extract_xarray_to_rdma(iter, rdma, len); + case ITER_FOLIOQ: + ret = smb_extract_folioq_to_rdma(iter, rdma, len); break; default: WARN_ON_ONCE(1); return -EIO; } - if (ret > 0) { - iov_iter_advance(iter, ret); - } else if (ret < 0) { + if (ret < 0) { while (rdma->nr_sge > before) { struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h new file mode 100644 index 000000000000..955680c3bb5f --- /dev/null +++ b/include/linux/folio_queue.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Queue of folios definitions + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_FOLIO_QUEUE_H +#define _LINUX_FOLIO_QUEUE_H + +#include + +/* + * Segment in a queue of running buffers. Each segment can hold a number of + * folios and a portion of the queue can be referenced with the ITER_FOLIOQ + * iterator. The possibility exists of inserting non-folio elements into the + * queue (such as gaps). + * + * Explicit prev and next pointers are used instead of a list_head to make it + * easier to add segments to tail and remove them from the head without the + * need for a lock. + */ +struct folio_queue { + struct folio_batch vec; /* Folios in the queue segment */ + u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + struct folio_queue *next; /* Next queue segment or NULL */ + struct folio_queue *prev; /* Previous queue segment of NULL */ + unsigned long marks; /* 1-bit mark per folio */ + unsigned long marks2; /* Second 1-bit mark per folio */ + unsigned long marks3; /* Third 1-bit mark per folio */ +#if PAGEVEC_SIZE > BITS_PER_LONG +#error marks is not big enough +#endif +}; + +static inline void folioq_init(struct folio_queue *folioq) +{ + folio_batch_init(&folioq->vec); + folioq->next = NULL; + folioq->prev = NULL; + folioq->marks = 0; + folioq->marks2 = 0; + folioq->marks3 = 0; +} + +static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) +{ + return PAGEVEC_SIZE; +} + +static inline unsigned int folioq_count(struct folio_queue *folioq) +{ + return folio_batch_count(&folioq->vec); +} + +static inline bool folioq_full(struct folio_queue *folioq) +{ + //return !folio_batch_space(&folioq->vec); + return folioq_count(folioq) >= folioq_nr_slots(folioq); +} + +static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks); +} + +static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks); +} + +static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks); +} + +static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks2); +} + +static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks2); +} + +static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks2); +} + +static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks3); +} + +static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks3); +} + +static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks3); +} + +static inline unsigned int __folio_order(struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio->_flags_1 & 0xff; +} + +static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + return slot; +} + +static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + folioq_mark(folioq, slot); + return slot; +} + +static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->vec.folios[slot]; +} + +static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->orders[slot]; +} + +static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) +{ + return PAGE_SIZE << folioq_folio_order(folioq, slot); +} + +static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) +{ + folioq->vec.folios[slot] = NULL; + folioq_unmark(folioq, slot); + folioq_unmark2(folioq, slot); + folioq_unmark3(folioq, slot); +} + +#endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index 270454a6703d..c4aa58032faf 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -10,6 +10,7 @@ #include #include +#include typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); @@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, return progress; } +/* + * Handle ITER_FOLIOQ. + */ +static __always_inline +size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, + iov_step_f step) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; + size_t progress = 0, skip = iter->iov_offset; + + if (slot == folioq_nr_slots(folioq)) { + /* The iterator may have been extended. */ + folioq = folioq->next; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t part, remain, consumed; + size_t fsize; + void *base; + + if (!folio) + break; + + fsize = folioq_folio_size(folioq, slot); + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + if (skip >= fsize) { + skip = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + if (remain) + break; + } while (len); + + iter->folioq_slot = slot; + iter->folioq = folioq; + iter->iov_offset = skip; + iter->count -= progress; + return progress; +} + /* * Handle ITER_XARRAY. */ @@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); @@ -271,4 +328,51 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } +/** + * iterate_and_advance_kernel - Iterate over a kernel-internal iterator + * @iter: The iterator to iterate over. + * @len: The amount to iterate over. + * @priv: Data for the step functions. + * @priv2: More data for the step functions. + * @step: Function for other iterators; given kernel addresses. + * + * Iterate over the next part of an iterator, up to the specified length. The + * buffer is presented in segments, which for kernel iteration are broken up by + * physical pages and mapped, with the mapped address being presented. + * + * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type + * iterators; it will not handle UBUF or IOVEC-type iterators. + * + * A step functions, @step, must be provided, one for handling mapped kernel + * addresses and the other is given user addresses which have the potential to + * fault since no pinning is performed. + * + * The step functions are passed the address and length of the segment, @priv, + * @priv2 and the amount of data so far iterated over (which can, for example, + * be added to @priv to point to the right part of a second buffer). The step + * functions should return the amount of the segment they didn't process (ie. 0 + * indicates complete processsing). + * + * This function returns the amount of data processed (ie. 0 means nothing was + * processed and the value of @len means processes to completion). + */ +static __always_inline +size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, + void *priv2, iov_step_f step) +{ + if (unlikely(iter->count < len)) + len = iter->count; + if (unlikely(!len)) + return 0; + if (iov_iter_is_bvec(iter)) + return iterate_bvec(iter, len, priv, priv2, step); + if (iov_iter_is_kvec(iter)) + return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); + if (iov_iter_is_xarray(iter)) + return iterate_xarray(iter, len, priv, priv2, step); + return iterate_discard(iter, len, priv, priv2, step); +} + #endif /* _LINUX_IOV_ITER_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c47443e7a97e..5eaceef41e6c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -38,11 +38,8 @@ static inline void folio_start_private_2(struct folio *folio) folio_set_private_2(folio); } -/* Marks used on xarray-based buffers */ -#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ -#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ - enum netfs_io_source { + NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, @@ -73,6 +70,7 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ }; /* @@ -133,9 +131,11 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio) struct netfs_io_stream { /* Submission tracking */ struct netfs_io_subrequest *construct; /* Op being constructed */ + size_t sreq_max_len; /* Maximum size of a subrequest */ + unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ - unsigned int submit_max_len; /* Amount I/O can be rounded up to */ + unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */ void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ @@ -176,41 +176,45 @@ struct netfs_io_subrequest { struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ - size_t max_len; /* Maximum size of the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ + size_t consumed; /* Amount of read data consumed */ + size_t prev_donated; /* Amount of data donated from previous subreq */ + size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ - unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ + unsigned char curr_folioq_slot; /* Folio currently being read */ + unsigned char curr_folio_order; /* Order of folio */ + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ -#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ +#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ -#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ }; enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ + NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ - NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ + NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ - NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ + NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */ nr__netfs_io_origin } __mode(byte); @@ -227,11 +231,14 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ + struct folio_queue *buffer; /* Head of I/O buffer */ + struct folio_queue *buffer_tail; /* Tail of I/O buffer */ struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ @@ -245,24 +252,23 @@ struct netfs_io_request { unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ atomic_t nr_outstanding; /* Number of ops in progress */ - atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ - size_t upper_len; /* Length can be extended to here */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ - short error; /* 0 or error that occurred */ + long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ + u8 buffer_head_slot; /* First slot in ->buffer */ + u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ - unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ @@ -274,6 +280,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ +#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; @@ -292,7 +299,7 @@ struct netfs_request_ops { /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); - bool (*clamp_length)(struct netfs_io_subrequest *subreq); + int (*prepare_read)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, @@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, + bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, + int error, bool was_async); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/linux/uio.h b/include/linux/uio.h index 7020adedfa08..853f9de5aa05 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include struct page; +struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,6 +26,7 @@ enum iter_type { ITER_IOVEC, ITER_BVEC, ITER_KVEC, + ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; @@ -66,6 +68,7 @@ struct iov_iter { const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; + const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; @@ -74,6 +77,7 @@ struct iov_iter { }; union { unsigned long nr_segs; + u8 folioq_slot; loff_t xarray_start; }; }; @@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) return iov_iter_type(i) == ITER_DISCARD; } +static inline bool iov_iter_is_folioq(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_FOLIOQ; +} + static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; @@ -180,6 +189,12 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, return copy_page_to_iter(&folio->page, offset, bytes, i); } +static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, + size_t bytes, struct iov_iter *i) +{ + return copy_page_from_iter(&folio->page, offset, bytes, i); +} + static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { @@ -273,6 +288,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, + unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 606b4a0f92da..76bd42a96815 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -20,6 +20,7 @@ EM(netfs_read_trace_expanded, "EXPANDED ") \ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ + EM(netfs_read_trace_read_gaps, "READ-GAPS") \ EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") @@ -33,13 +34,14 @@ #define netfs_rreq_origins \ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ + EM(NETFS_READ_GAPS, "RG") \ EM(NETFS_READ_FOR_WRITE, "RW") \ - EM(NETFS_COPY_TO_CACHE, "CC") \ + EM(NETFS_DIO_READ, "DR") \ EM(NETFS_WRITEBACK, "WB") \ EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ - EM(NETFS_DIO_READ, "DR") \ - E_(NETFS_DIO_WRITE, "DW") + EM(NETFS_DIO_WRITE, "DW") \ + E_(NETFS_PGPRIV2_COPY_TO_CACHE, "2C") #define netfs_rreq_traces \ EM(netfs_rreq_trace_assess, "ASSESS ") \ @@ -60,6 +62,7 @@ E_(netfs_rreq_trace_write_done, "WR-DONE") #define netfs_sreq_sources \ + EM(NETFS_SOURCE_UNKNOWN, "----") \ EM(NETFS_FILL_WITH_ZEROES, "ZERO") \ EM(NETFS_DOWNLOAD_FROM_SERVER, "DOWN") \ EM(NETFS_READ_FROM_CACHE, "READ") \ @@ -69,15 +72,25 @@ E_(NETFS_INVALID_WRITE, "INVL") #define netfs_sreq_traces \ + EM(netfs_sreq_trace_add_donations, "+DON ") \ + EM(netfs_sreq_trace_added, "ADD ") \ + EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ + EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ + EM(netfs_sreq_trace_donate_to_next, "DON-N") \ EM(netfs_sreq_trace_download_instead, "RDOWN") \ EM(netfs_sreq_trace_fail, "FAIL ") \ EM(netfs_sreq_trace_free, "FREE ") \ + EM(netfs_sreq_trace_hit_eof, "EOF ") \ + EM(netfs_sreq_trace_io_progress, "IO ") \ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ - EM(netfs_sreq_trace_resubmit_short, "SHORT") \ + EM(netfs_sreq_trace_progress, "PRGRS") \ + EM(netfs_sreq_trace_reprep_failed, "REPFL") \ EM(netfs_sreq_trace_retry, "RETRY") \ + EM(netfs_sreq_trace_short, "SHORT") \ + EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ EM(netfs_sreq_trace_terminated, "TERM ") \ EM(netfs_sreq_trace_write, "WRITE") \ @@ -118,7 +131,7 @@ EM(netfs_sreq_trace_new, "NEW ") \ EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ - EM(netfs_sreq_trace_put_discard, "PUT DISCARD") \ + EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \ EM(netfs_sreq_trace_put_done, "PUT DONE ") \ EM(netfs_sreq_trace_put_failed, "PUT FAILED ") \ EM(netfs_sreq_trace_put_merged, "PUT MERGED ") \ @@ -129,7 +142,6 @@ E_(netfs_sreq_trace_put_terminated, "PUT TERM ") #define netfs_folio_traces \ - /* The first few correspond to enum netfs_how_to_modify */ \ EM(netfs_folio_is_uptodate, "mod-uptodate") \ EM(netfs_just_prefetch, "mod-prefetch") \ EM(netfs_whole_folio_modify, "mod-whole-f") \ @@ -139,8 +151,9 @@ EM(netfs_flush_content, "flush") \ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ - /* The rest are for writeback */ \ + EM(netfs_folio_trace_abandon, "abandon") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ + EM(netfs_folio_trace_cancel_store, "cancel-store") \ EM(netfs_folio_trace_clear, "clear") \ EM(netfs_folio_trace_clear_cc, "clear-cc") \ EM(netfs_folio_trace_clear_g, "clear-g") \ @@ -155,7 +168,12 @@ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_not_under_wback, "!wback") \ + EM(netfs_folio_trace_put, "put") \ + EM(netfs_folio_trace_read, "read") \ + EM(netfs_folio_trace_read_done, "read-done") \ EM(netfs_folio_trace_read_gaps, "read-gaps") \ + EM(netfs_folio_trace_read_put, "read-put") \ + EM(netfs_folio_trace_read_unlock, "read-unlock") \ EM(netfs_folio_trace_redirtied, "redirtied") \ EM(netfs_folio_trace_store, "store") \ EM(netfs_folio_trace_store_copy, "store-copy") \ @@ -168,6 +186,12 @@ EM(netfs_contig_trace_jump, "-->JUMP-->") \ E_(netfs_contig_trace_unlock, "Unlock") +#define netfs_donate_traces \ + EM(netfs_trace_donate_tail_to_prev, "tail-to-prev") \ + EM(netfs_trace_donate_to_prev, "to-prev") \ + EM(netfs_trace_donate_to_next, "to-next") \ + E_(netfs_trace_donate_to_deferred_next, "defer-next") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -185,6 +209,7 @@ enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte); enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte); enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); +enum netfs_donate_trace { netfs_donate_traces } __mode(byte); #endif @@ -207,6 +232,7 @@ netfs_rreq_ref_traces; netfs_sreq_ref_traces; netfs_folio_traces; netfs_collect_contig_traces; +netfs_donate_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -227,6 +253,7 @@ TRACE_EVENT(netfs_read, TP_STRUCT__entry( __field(unsigned int, rreq ) __field(unsigned int, cookie ) + __field(loff_t, i_size ) __field(loff_t, start ) __field(size_t, len ) __field(enum netfs_read_trace, what ) @@ -236,18 +263,19 @@ TRACE_EVENT(netfs_read, TP_fast_assign( __entry->rreq = rreq->debug_id; __entry->cookie = rreq->cache_resources.debug_id; + __entry->i_size = rreq->i_size; __entry->start = start; __entry->len = len; __entry->what = what; __entry->netfs_inode = rreq->inode->i_ino; ), - TP_printk("R=%08x %s c=%08x ni=%x s=%llx %zx", + TP_printk("R=%08x %s c=%08x ni=%x s=%llx l=%zx sz=%llx", __entry->rreq, __print_symbolic(__entry->what, netfs_read_traces), __entry->cookie, __entry->netfs_inode, - __entry->start, __entry->len) + __entry->start, __entry->len, __entry->i_size) ); TRACE_EVENT(netfs_rreq, @@ -513,33 +541,6 @@ TRACE_EVENT(netfs_collect, __entry->start + __entry->len) ); -TRACE_EVENT(netfs_collect_contig, - TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to, - enum netfs_collect_contig_trace type), - - TP_ARGS(wreq, to, type), - - TP_STRUCT__entry( - __field(unsigned int, wreq) - __field(enum netfs_collect_contig_trace, type) - __field(unsigned long long, contiguity) - __field(unsigned long long, to) - ), - - TP_fast_assign( - __entry->wreq = wreq->debug_id; - __entry->type = type; - __entry->contiguity = wreq->contiguity; - __entry->to = to; - ), - - TP_printk("R=%08x %llx -> %llx %s", - __entry->wreq, - __entry->contiguity, - __entry->to, - __print_symbolic(__entry->type, netfs_collect_contig_traces)) - ); - TRACE_EVENT(netfs_collect_sreq, TP_PROTO(const struct netfs_io_request *wreq, const struct netfs_io_subrequest *subreq), @@ -611,7 +612,6 @@ TRACE_EVENT(netfs_collect_state, __field(unsigned int, notes ) __field(unsigned long long, collected_to ) __field(unsigned long long, cleaned_to ) - __field(unsigned long long, contiguity ) ), TP_fast_assign( @@ -619,12 +619,11 @@ TRACE_EVENT(netfs_collect_state, __entry->notes = notes; __entry->collected_to = collected_to; __entry->cleaned_to = wreq->cleaned_to; - __entry->contiguity = wreq->contiguity; ), - TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x", + TP_printk("R=%08x col=%llx cln=%llx n=%x", __entry->wreq, __entry->collected_to, - __entry->cleaned_to, __entry->contiguity, + __entry->cleaned_to, __entry->notes) ); @@ -681,6 +680,71 @@ TRACE_EVENT(netfs_collect_stream, __entry->collected_to, __entry->front) ); +TRACE_EVENT(netfs_progress, + TP_PROTO(const struct netfs_io_subrequest *subreq, + unsigned long long start, size_t avail, size_t part), + + TP_ARGS(subreq, start, avail, part), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, subreq) + __field(unsigned int, consumed) + __field(unsigned int, transferred) + __field(unsigned long long, f_start) + __field(unsigned int, f_avail) + __field(unsigned int, f_part) + __field(unsigned char, slot) + ), + + TP_fast_assign( + __entry->rreq = subreq->rreq->debug_id; + __entry->subreq = subreq->debug_index; + __entry->consumed = subreq->consumed; + __entry->transferred = subreq->transferred; + __entry->f_start = start; + __entry->f_avail = avail; + __entry->f_part = part; + __entry->slot = subreq->curr_folioq_slot; + ), + + TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x", + __entry->rreq, __entry->subreq, __entry->f_start, + __entry->consumed, __entry->transferred, + __entry->f_part, __entry->f_avail, __entry->slot) + ); + +TRACE_EVENT(netfs_donate, + TP_PROTO(const struct netfs_io_request *rreq, + const struct netfs_io_subrequest *from, + const struct netfs_io_subrequest *to, + size_t amount, + enum netfs_donate_trace trace), + + TP_ARGS(rreq, from, to, amount, trace), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, from) + __field(unsigned int, to) + __field(unsigned int, amount) + __field(enum netfs_donate_trace, trace) + ), + + TP_fast_assign( + __entry->rreq = rreq->debug_id; + __entry->from = from->debug_index; + __entry->to = to ? to->debug_index : -1; + __entry->amount = amount; + __entry->trace = trace; + ), + + TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", + __entry->rreq, __entry->from, __entry->to, + __print_symbolic(__entry->trace, netfs_donate_traces), + __entry->amount) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4a6a9f419bd7..97003155bfac 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -527,6 +527,39 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->__iov = iov; } +static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) +{ + const struct folio_queue *folioq = i->folioq; + unsigned int slot = i->folioq_slot; + + if (!i->count) + return; + i->count -= size; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + + size += i->iov_offset; /* From beginning of current segment. */ + do { + size_t fsize = folioq_folio_size(folioq, slot); + + if (likely(size < fsize)) + break; + size -= fsize; + slot++; + if (slot >= folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } while (size); + + i->iov_offset = size; + i->folioq_slot = slot; + i->folioq = folioq; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) @@ -539,12 +572,40 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); + } else if (iov_iter_is_folioq(i)) { + iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); +static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) +{ + const struct folio_queue *folioq = i->folioq; + unsigned int slot = i->folioq_slot; + + for (;;) { + size_t fsize; + + if (slot == 0) { + folioq = folioq->prev; + slot = folioq_nr_slots(folioq); + } + slot--; + + fsize = folioq_folio_size(folioq, slot); + if (unroll <= fsize) { + i->iov_offset = fsize - unroll; + break; + } + unroll -= fsize; + } + + i->folioq_slot = slot; + i->folioq = folioq; +} + void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) @@ -576,6 +637,9 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) } unroll -= n; } + } else if (iov_iter_is_folioq(i)) { + i->iov_offset = 0; + iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { @@ -603,6 +667,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } + if (unlikely(iov_iter_is_folioq(i))) + return !i->count ? 0 : + umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); @@ -639,6 +706,36 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_bvec); +/** + * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @folioq: The starting point in the folio queue. + * @first_slot: The first slot in the folio queue to use + * @offset: The offset into the folio in the first slot to start at + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator to either draw data out of the pages attached to an + * inode or to inject data into those pages. The pages *must* be prevented + * from evaporation, either by taking a ref on them or locking them by the + * caller. + */ +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, unsigned int first_slot, + unsigned int offset, size_t count) +{ + BUG_ON(direction & ~1); + *i = (struct iov_iter) { + .iter_type = ITER_FOLIOQ, + .data_source = direction, + .folioq = folioq, + .folioq_slot = first_slot, + .count = count, + .iov_offset = offset, + }; +} +EXPORT_SYMBOL(iov_iter_folio_queue); + /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. @@ -765,12 +862,19 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); + /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } + if (iov_iter_is_folioq(i)) { + if (i->count & len_mask) + return false; + if (i->iov_offset & addr_mask) + return false; + } return true; } @@ -835,6 +939,9 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); + /* With both xarray and folioq types, we're dealing with whole folios. */ + if (iov_iter_is_folioq(i)) + return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; @@ -887,6 +994,62 @@ static int want_pages_array(struct page ***res, size_t size, return count; } +static ssize_t iter_folioq_get_pages(struct iov_iter *iter, + struct page ***ppages, size_t maxsize, + unsigned maxpages, size_t *_start_offset) +{ + const struct folio_queue *folioq = iter->folioq; + struct page **pages; + unsigned int slot = iter->folioq_slot; + size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + if (WARN_ON(iov_offset != 0)) + return -EIO; + } + + maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); + if (!maxpages) + return -ENOMEM; + *_start_offset = iov_offset & ~PAGE_MASK; + pages = *ppages; + + for (;;) { + struct folio *folio = folioq_folio(folioq, slot); + size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); + size_t part = PAGE_SIZE - offset % PAGE_SIZE; + + part = umin(part, umin(maxsize - extracted, fsize - offset)); + count -= part; + iov_offset += part; + extracted += part; + + *pages = folio_page(folio, offset / PAGE_SIZE); + get_page(*pages); + pages++; + maxpages--; + if (maxpages == 0 || extracted >= maxsize) + break; + + if (offset >= fsize) { + iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + } + + iter->count = count; + iter->iov_offset = iov_offset; + iter->folioq = folioq; + iter->folioq_slot = slot; + return extracted; +} + static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { @@ -1034,6 +1197,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, } return maxsize; } + if (iov_iter_is_folioq(i)) + return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; @@ -1118,6 +1283,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); + if (iov_iter_is_folioq(i)) { + unsigned offset = i->iov_offset % PAGE_SIZE; + int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); + return min(npages, maxpages); + } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); @@ -1398,6 +1568,68 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) i->nr_segs = state->nr_segs; } +/* + * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does + * not get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + const struct folio_queue *folioq = i->folioq; + struct page **p; + unsigned int nr = 0; + size_t extracted = 0, offset, slot = i->folioq_slot; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + if (WARN_ON(i->iov_offset != 0)) + return -EIO; + } + + offset = i->iov_offset & ~PAGE_MASK; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + + for (;;) { + struct folio *folio = folioq_folio(folioq, slot); + size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); + size_t part = PAGE_SIZE - offset % PAGE_SIZE; + + if (offset < fsize) { + part = umin(part, umin(maxsize - extracted, fsize - offset)); + i->count -= part; + i->iov_offset += part; + extracted += part; + + p[nr++] = folio_page(folio, offset / PAGE_SIZE); + } + + if (nr >= maxpages || extracted >= maxsize) + break; + + if (i->iov_offset >= fsize) { + i->iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + } + + i->folioq = folioq; + i->folioq_slot = slot; + return extracted; +} + /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. @@ -1618,8 +1850,8 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * - * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are - * merely listed; no extra refs or pins are obtained. + * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the + * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: @@ -1654,6 +1886,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); + if (iov_iter_is_folioq(i)) + return iov_iter_extract_folioq_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 27e0c8ee71d8..13e15687675a 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include MODULE_DESCRIPTION("iov_iter testing"); @@ -62,6 +63,9 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, KUNIT_ASSERT_EQ(test, got, npages); } + for (int i = 0; i < npages; i++) + pages[i]->index = i; + buffer = vmap(pages, npages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer); @@ -362,6 +366,179 @@ stop: KUNIT_SUCCEED(test); } +static void iov_kunit_destroy_folioq(void *data) +{ + struct folio_queue *folioq, *next; + + for (folioq = data; folioq; folioq = next) { + next = folioq->next; + for (int i = 0; i < folioq_nr_slots(folioq); i++) + if (folioq_folio(folioq, i)) + folio_put(folioq_folio(folioq, i)); + kfree(folioq); + } +} + +static void __init iov_kunit_load_folioq(struct kunit *test, + struct iov_iter *iter, int dir, + struct folio_queue *folioq, + struct page **pages, size_t npages) +{ + struct folio_queue *p = folioq; + size_t size = 0; + int i; + + for (i = 0; i < npages; i++) { + if (folioq_full(p)) { + p->next = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p->next); + folioq_init(p->next); + p->next->prev = p; + p = p->next; + } + folioq_append(p, page_folio(pages[i])); + size += PAGE_SIZE; + } + iov_iter_folio_queue(iter, dir, folioq, 0, 0, size); +} + +static struct folio_queue *iov_kunit_create_folioq(struct kunit *test) +{ + struct folio_queue *folioq; + + folioq = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, folioq); + kunit_add_action_or_reset(test, iov_kunit_destroy_folioq, folioq); + folioq_init(folioq); + return folioq; +} + +/* + * Test copying to a ITER_FOLIOQ-type iterator. + */ +static void __init iov_kunit_copy_to_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct iov_iter iter; + struct folio_queue *folioq; + struct page **spages, **bpages; + u8 *scratch, *buffer; + size_t bufsize, npages, size, copied; + int i, patt; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + scratch = iov_kunit_create_buffer(test, &spages, npages); + for (i = 0; i < bufsize; i++) + scratch[i] = pattern(i); + + buffer = iov_kunit_create_buffer(test, &bpages, npages); + memset(buffer, 0, bufsize); + + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + i = 0; + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + size = pr->to - pr->from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, READ, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, pr->from); + copied = copy_to_iter(scratch + i, size, &iter); + + KUNIT_EXPECT_EQ(test, copied, size); + KUNIT_EXPECT_EQ(test, iter.count, 0); + KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); + i += size; + if (test->status == KUNIT_FAILURE) + goto stop; + } + + /* Build the expected image in the scratch buffer. */ + patt = 0; + memset(scratch, 0, bufsize); + for (pr = kvec_test_ranges; pr->from >= 0; pr++) + for (i = pr->from; i < pr->to; i++) + scratch[i] = pattern(patt++); + + /* Compare the images */ + for (i = 0; i < bufsize; i++) { + KUNIT_EXPECT_EQ_MSG(test, buffer[i], scratch[i], "at i=%x", i); + if (buffer[i] != scratch[i]) + return; + } + +stop: + KUNIT_SUCCEED(test); +} + +/* + * Test copying from a ITER_FOLIOQ-type iterator. + */ +static void __init iov_kunit_copy_from_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct iov_iter iter; + struct folio_queue *folioq; + struct page **spages, **bpages; + u8 *scratch, *buffer; + size_t bufsize, npages, size, copied; + int i, j; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + buffer = iov_kunit_create_buffer(test, &bpages, npages); + for (i = 0; i < bufsize; i++) + buffer[i] = pattern(i); + + scratch = iov_kunit_create_buffer(test, &spages, npages); + memset(scratch, 0, bufsize); + + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + i = 0; + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + size = pr->to - pr->from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, pr->from); + copied = copy_from_iter(scratch + i, size, &iter); + + KUNIT_EXPECT_EQ(test, copied, size); + KUNIT_EXPECT_EQ(test, iter.count, 0); + KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); + i += size; + } + + /* Build the expected image in the main buffer. */ + i = 0; + memset(buffer, 0, bufsize); + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + for (j = pr->from; j < pr->to; j++) { + buffer[i++] = pattern(j); + if (i >= bufsize) + goto stop; + } + } +stop: + + /* Compare the images */ + for (i = 0; i < bufsize; i++) { + KUNIT_EXPECT_EQ_MSG(test, scratch[i], buffer[i], "at i=%x", i); + if (scratch[i] != buffer[i]) + return; + } + + KUNIT_SUCCEED(test); +} + static void iov_kunit_destroy_xarray(void *data) { struct xarray *xarray = data; @@ -677,6 +854,85 @@ stop: KUNIT_SUCCEED(test); } +/* + * Test the extraction of ITER_FOLIOQ-type iterators. + */ +static void __init iov_kunit_extract_pages_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct folio_queue *folioq; + struct iov_iter iter; + struct page **bpages, *pagelist[8], **pages = pagelist; + ssize_t len; + size_t bufsize, size = 0, npages; + int i, from; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + iov_kunit_create_buffer(test, &bpages, npages); + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + from = pr->from; + size = pr->to - from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, from); + + do { + size_t offset0 = LONG_MAX; + + for (i = 0; i < ARRAY_SIZE(pagelist); i++) + pagelist[i] = (void *)(unsigned long)0xaa55aa55aa55aa55ULL; + + len = iov_iter_extract_pages(&iter, &pages, 100 * 1024, + ARRAY_SIZE(pagelist), 0, &offset0); + KUNIT_EXPECT_GE(test, len, 0); + if (len < 0) + break; + KUNIT_EXPECT_LE(test, len, size); + KUNIT_EXPECT_EQ(test, iter.count, size - len); + if (len == 0) + break; + size -= len; + KUNIT_EXPECT_GE(test, (ssize_t)offset0, 0); + KUNIT_EXPECT_LT(test, offset0, PAGE_SIZE); + + for (i = 0; i < ARRAY_SIZE(pagelist); i++) { + struct page *p; + ssize_t part = min_t(ssize_t, len, PAGE_SIZE - offset0); + int ix; + + KUNIT_ASSERT_GE(test, part, 0); + ix = from / PAGE_SIZE; + KUNIT_ASSERT_LT(test, ix, npages); + p = bpages[ix]; + KUNIT_EXPECT_PTR_EQ(test, pagelist[i], p); + KUNIT_EXPECT_EQ(test, offset0, from % PAGE_SIZE); + from += part; + len -= part; + KUNIT_ASSERT_GE(test, len, 0); + if (len == 0) + break; + offset0 = 0; + } + + if (test->status == KUNIT_FAILURE) + goto stop; + } while (iov_iter_count(&iter) > 0); + + KUNIT_EXPECT_EQ(test, size, 0); + KUNIT_EXPECT_EQ(test, iter.count, 0); + } + +stop: + KUNIT_SUCCEED(test); +} + /* * Test the extraction of ITER_XARRAY-type iterators. */ @@ -761,10 +1017,13 @@ static struct kunit_case __refdata iov_kunit_cases[] = { KUNIT_CASE(iov_kunit_copy_from_kvec), KUNIT_CASE(iov_kunit_copy_to_bvec), KUNIT_CASE(iov_kunit_copy_from_bvec), + KUNIT_CASE(iov_kunit_copy_to_folioq), + KUNIT_CASE(iov_kunit_copy_from_folioq), KUNIT_CASE(iov_kunit_copy_to_xarray), KUNIT_CASE(iov_kunit_copy_from_xarray), KUNIT_CASE(iov_kunit_extract_pages_kvec), KUNIT_CASE(iov_kunit_extract_pages_bvec), + KUNIT_CASE(iov_kunit_extract_pages_folioq), KUNIT_CASE(iov_kunit_extract_pages_xarray), {} }; diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7bc2220fea80..473b2646f71c 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -11,6 +11,7 @@ #include #include #include +#include /** * sg_next - return the next scatterlist entry in a list @@ -1261,6 +1262,67 @@ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, return ret; } +/* + * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to + * the scatterlist. The pages are not pinned. + */ +static ssize_t extract_folioq_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct folio_queue *folioq = iter->folioq; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned int slot = iter->folioq_slot; + ssize_t ret = 0; + size_t offset = iter->iov_offset; + + BUG_ON(!folioq); + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + if (WARN_ON_ONCE(!folioq)) + return 0; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t fsize = folioq_folio_size(folioq, slot); + + if (offset < fsize) { + size_t part = umin(maxsize - ret, fsize - offset); + + sg_set_page(sg, folio_page(folio, 0), part, offset); + sgtable->nents++; + sg++; + sg_max--; + offset += part; + ret += part; + } + + if (offset >= fsize) { + offset = 0; + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (!folioq->next) { + WARN_ON_ONCE(ret < iter->count); + break; + } + folioq = folioq->next; + slot = 0; + } + } + } while (sg_max > 0 && ret < maxsize); + + iter->folioq = folioq; + iter->folioq_slot = slot; + iter->iov_offset = offset; + iter->count -= ret; + return ret; +} + /* * Extract up to sg_max folios from an XARRAY-type iterator and add them to * the scatterlist. The pages are not pinned. @@ -1323,8 +1385,8 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, * addition of @sg_max elements. * * The pages referred to by UBUF- and IOVEC-type iterators are extracted and - * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- - * and DISCARD-type are not supported. + * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't + * pinned; DISCARD-type is not supported. * * No end mark is placed on the scatterlist; that's left to the caller. * @@ -1356,6 +1418,9 @@ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, case ITER_KVEC: return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); + case ITER_FOLIOQ: + return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); case ITER_XARRAY: return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags);