From d0364f9490d7e098963ce4d146b51f9cd1199412 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Aug 2021 14:43:43 -0700 Subject: [PATCH 01/41] iomap: simplify iomap_readpage_actor Now that the outstanding reads are counted in bytes, there is no need to use the low-level __bio_try_merge_page API; we can switch back to always using bio_add_page and simplify iomap_readpage_actor again. Signed-off-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 87ccb3438bec..712b6513a0c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -241,7 +241,6 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap_readpage_ctx *ctx = data; struct page *page = ctx->cur_page; struct iomap_page *iop; - bool same_page = false, is_contig = false; loff_t orig_pos = pos; unsigned poff, plen; sector_t sector; @@ -268,16 +267,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (iop) atomic_add(plen, &iop->read_bytes_pending); - /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) { - if (__bio_try_merge_page(ctx->bio, page, plen, poff, - &same_page)) - goto done; - is_contig = true; - } - - if (!is_contig || bio_full(ctx->bio, plen)) { + if (!ctx->bio || + bio_end_sector(ctx->bio) != sector || + bio_add_page(ctx->bio, page, plen, poff) != plen) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -301,9 +294,8 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; + __bio_add_page(ctx->bio, page, plen, poff); } - - bio_add_page(ctx->bio, page, plen, poff); done: /* * Move the caller beyond our range so that it keeps making progress. From c1b79f11f4ec27d3b3197a9584950a3be178c717 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Aug 2021 14:43:43 -0700 Subject: [PATCH 02/41] iomap: simplify iomap_add_to_ioend Now that the outstanding writes are counted in bytes, there is no need to use the low-level __bio_try_merge_page API; we can switch back to always using bio_add_page and simplify iomap_add_to_ioend again. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong
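Both conversions collapse bio construction to one pattern: let bio_add_page() attempt the append, and treat failure (no current bio, a discontiguous sector, or a full bio) as the signal to start a fresh bio, where __bio_add_page() is guaranteed to succeed. A minimal stand-alone sketch of that pattern against the v5.14-era bio API; the helper name and the submit-on-overflow policy are illustrative (the writeback path chains bios instead), and device/end_io setup is omitted:

	/*
	 * Append @page to *biop, starting a new bio whenever the current
	 * one cannot take the page.  bio_add_page() fails both when the
	 * bio is full and when the page cannot be appended, so no
	 * separate __bio_try_merge_page()/bio_full() checks are needed.
	 */
	static void bio_add_or_grow(struct bio **biop, struct page *page,
			unsigned int len, unsigned int off, sector_t sector)
	{
		struct bio *bio = *biop;

		if (bio && bio_end_sector(bio) == sector &&
		    bio_add_page(bio, page, len, off) == len)
			return;

		if (bio)
			submit_bio(bio);
		bio = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
		bio->bi_iter.bi_sector = sector;
		/* a freshly allocated bio always has room for one page */
		__bio_add_page(bio, page, len, off);
		*biop = bio;
	}

This is why both rewritten call sites can drop the separate merge/full logic: the single bio_add_page() return-value check covers every case the old code handled explicitly.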
--- fs/iomap/buffered-io.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 712b6513a0c4..a463b41c0a16 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1251,7 +1251,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, sector_t sector = iomap_sector(&wpc->iomap, offset); unsigned len = i_blocksize(inode); unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { if (wpc->ioend) @@ -1259,19 +1258,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); } - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - if (iop) - atomic_add(len, &iop->write_bytes_pending); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) { - wpc->ioend->io_bio = - iomap_chain_bio(wpc->ioend->io_bio); - } - bio_add_page(wpc->ioend->io_bio, page, len, poff); + if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + __bio_add_page(wpc->ioend->io_bio, page, len, poff); } + if (iop) + atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, page, len); } From 69f4a26c1e0c7c5e5e77c5bd7b271743c124c545 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Aug 2021 09:38:22 -0700 Subject: [PATCH 03/41] iomap: support reading inline data from non-zero pos The existing inline data support only works for cases where the entire file is stored as inline data. For larger files, EROFS stores the initial blocks separately and the remainder of the file ("file tail") adjacent to the inode. Generalise inline data to allow reading the inline file tail. Tails may not cross a page boundary in memory. We currently have no filesystems that support both tails and writing, so that case is disabled for now (see iomap_write_begin_inline). Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andreas Gruenbacher Signed-off-by: Gao Xiang Signed-off-by: Darrick J.
Wong --- fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------ fs/iomap/direct-io.c | 10 ++++++---- include/linux/iomap.h | 18 ++++++++++++++++++ 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index a463b41c0a16..1d31ff6bfea0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,25 +205,32 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static void -iomap_read_inline_data(struct inode *inode, struct page *page, +static int iomap_read_inline_data(struct inode *inode, struct page *page, struct iomap *iomap) { - size_t size = i_size_read(inode); + size_t size = i_size_read(inode) - iomap->offset; void *addr; if (PageUptodate(page)) - return; + return 0; - BUG_ON(page_has_private(page)); - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); + /* inline data must start page aligned in the file */ + if (WARN_ON_ONCE(offset_in_page(iomap->offset))) + return -EIO; + if (WARN_ON_ONCE(size > PAGE_SIZE - + offset_in_page(iomap->inline_data))) + return -EIO; + if (WARN_ON_ONCE(size > iomap->length)) + return -EIO; + if (WARN_ON_ONCE(page_has_private(page))) + return -EIO; addr = kmap_atomic(page); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - size); kunmap_atomic(addr); SetPageUptodate(page); + return 0; } static inline bool iomap_block_needs_zeroing(struct inode *inode, @@ -246,8 +253,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, sector_t sector; if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); + int ret = iomap_read_inline_data(inode, page, iomap); + + if (ret) + return ret; return PAGE_SIZE; } @@ -581,6 +590,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } +static int iomap_write_begin_inline(struct inode *inode, + struct page *page, struct iomap *srcmap) +{ + /* needs more work for the tailpacking case; disable for now */ + if (WARN_ON_ONCE(srcmap->offset != 0)) + return -EIO; + return iomap_read_inline_data(inode, page, srcmap); +} + static int iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, struct page **pagep, struct iomap *iomap, struct iomap *srcmap) @@ -610,7 +628,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); + status = iomap_write_begin_inline(inode, page, srcmap); else if (iomap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else @@ -663,11 +681,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page, void *addr; WARN_ON_ONCE(!PageUptodate(page)); - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); addr = kmap_atomic(page); - memcpy(iomap->inline_data + pos, addr + pos, copied); + memcpy(iomap_inline_data(iomap, pos), addr + pos, copied); kunmap_atomic(addr); mark_inode_dirty(inode); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323..41ccbfc9dc82 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) { struct iov_iter *iter = dio->submit.iter; + void *inline_data = iomap_inline_data(iomap, pos); 
size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) + return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { loff_t size = inode->i_size; if (pos > size) - memset(iomap->inline_data + size, 0, pos - size); - copied = copy_from_iter(iomap->inline_data + pos, length, iter); + memset(iomap_inline_data(iomap, size), 0, pos - size); + copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) i_size_write(inode, pos + copied); mark_inode_dirty(inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; return copied; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 479c1da3e221..b8ec145b2975 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos) return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } +/* + * Returns the inline data pointer for logical offset @pos. + */ +static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos) +{ + return iomap->inline_data + pos - iomap->offset; +} + +/* + * Check if the mapping's length is within the valid range for inline data. + * This is used to guard against accessing data beyond the page inline_data + * points at. + */ +static inline bool iomap_inline_data_valid(struct iomap *iomap) +{ + return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); +} + /* * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare * and page_done will be called for each page written to. This only applies to From b405435b419cb660455ba54fd47086216e58fed6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 2 Aug 2021 14:45:57 -0700 Subject: [PATCH 04/41] iomap: Support inline data with block size < page size Remove the restriction that inline data must start on a page boundary in a file. This allows, for example, the first 2KiB to be stored out of line and the trailing 30 bytes to be stored inline. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/buffered-io.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1d31ff6bfea0..28cfa7fab023 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -209,28 +209,26 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, struct iomap *iomap) { size_t size = i_size_read(inode) - iomap->offset; + size_t poff = offset_in_page(iomap->offset); void *addr; if (PageUptodate(page)) - return 0; + return PAGE_SIZE - poff; - /* inline data must start page aligned in the file */ - if (WARN_ON_ONCE(offset_in_page(iomap->offset))) - return -EIO; if (WARN_ON_ONCE(size > PAGE_SIZE - offset_in_page(iomap->inline_data))) return -EIO; if (WARN_ON_ONCE(size > iomap->length)) return -EIO; - if (WARN_ON_ONCE(page_has_private(page))) - return -EIO; + if (poff > 0) + iomap_page_create(inode, page); - addr = kmap_atomic(page); + addr = kmap_atomic(page) + poff; memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); + memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_atomic(addr); - SetPageUptodate(page); - return 0; + iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); + return PAGE_SIZE - poff; } static inline bool iomap_block_needs_zeroing(struct inode *inode, @@ -252,13 +250,8 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - int ret = iomap_read_inline_data(inode, page, iomap); - - if (ret) - return ret; - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE) + return iomap_read_inline_data(inode, page, iomap); /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(inode, page); @@ -593,10 +586,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, static int iomap_write_begin_inline(struct inode *inode, struct page *page, struct iomap *srcmap) { + int ret; + /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(srcmap->offset != 0)) return -EIO; - return iomap_read_inline_data(inode, page, srcmap); + ret = iomap_read_inline_data(inode, page, srcmap); + if (ret < 0) + return ret; + return 0; } static int From f1f264b4c134ee65cdadece7a20f3c0643602a4a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 2 Aug 2021 14:46:31 -0700 Subject: [PATCH 05/41] iomap: Fix some typos and bad grammar Fix some typos and bad grammar in buffered-io.c to make the comments easier to read. Signed-off-by: Andreas Gruenbacher Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 72 +++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 28cfa7fab023..c1c8cd41ea81 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page) { /* * per-block data is stored in the head page. Callers should - * not be dealing with tail pages (and if they are, they can + * not be dealing with tail pages, and if they are, they can * call thp_head() first. 
*/ VM_BUG_ON_PGFLAGS(PageTail(page), page); @@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, unsigned last = (poff + plen - 1) >> block_bits; /* - * If the block size is smaller than the page size we need to check the + * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ @@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, } /* - * If the extent spans the block that contains the i_size we need to + * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ @@ -301,7 +301,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, done: /* * Move the caller beyond our range so that it keeps making progress. - * For that we have to include any leading non-uptodate ranges, but + * For that, we have to include any leading non-uptodate ranges, but * we can skip trailing ones as they will be handled in the next * iteration. */ @@ -338,9 +338,9 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readahead and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page, we always * return 0 and just mark the page as PageError on errors. This - * should be cleaned up all through the stack eventually. + * should be cleaned up throughout the stack eventually. */ return 0; } @@ -461,7 +461,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask) /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(), skip those here. + * ->releasepage() via shrink_active_list(); skip those here. */ if (PageDirty(page) || PageWriteback(page)) return 0; @@ -476,7 +476,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) trace_iomap_invalidatepage(page->mapping->host, offset, len); /* - * If we are invalidating the entire page, clear the dirty state from it + * If we're invalidating the entire page, clear the dirty state from it * and release it to avoid unnecessary buildup of the LRU. */ if (offset == 0 && len == PAGE_SIZE) { @@ -658,13 +658,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, /* * The blocks that were entirely written will now be uptodate, so we * don't have to worry about a readpage reading them and overwriting a - * partial write. However if we have encountered a short write and only + * partial write. However, if we've encountered a short write and only * partially written into a block, it will not be marked uptodate, so a * readpage might come in and destroy our partial write. * - * Do the simplest thing, and just treat any short write to a non - * uptodate page as a zero-length write, and force the caller to redo - * the whole thing. + * Do the simplest thing and just treat any short write to a + * non-uptodate page as a zero-length write, and force the caller to + * redo the whole thing. */ if (unlikely(copied < len && !PageUptodate(page))) return 0; @@ -752,7 +752,7 @@ again: bytes = length; /* - * Bring in the user page that we will copy from _first_. + * Bring in the user page that we'll copy from _first_. 
* Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. @@ -1161,7 +1161,7 @@ static void iomap_writepage_end_bio(struct bio *bio) * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback + * the submission process has failed after we've marked pages for writeback * and unlocked them. In this situation, we need to fail the bio instead of * submitting it. This typically only happens on a filesystem shutdown. */ @@ -1176,7 +1176,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, error = wpc->ops->prepare_ioend(ioend, error); if (error) { /* - * If we are failing the IO now, just mark the ioend with an + * If we're failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately * as there is only one reference to the ioend at this point in * time. @@ -1218,7 +1218,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, /* * Allocate a new bio, and chain the old bio to the new one. * - * Note that we have to do perform the chaining in this unintuitive order + * Note that we have to perform the chaining in this unintuitive order * so that the bi_private linkage is set up in the right direction for the * traversal in iomap_finish_ioend(). */ @@ -1257,7 +1257,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. + * first; otherwise finish off the current ioend and start another. */ static void iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, @@ -1288,9 +1288,9 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that + * the forward progress guarantees we need to provide. The current ioend we're + * adding blocks to is cached in the writepage context, and if the new block + * doesn't append to the cached ioend, it will create a new ioend and cache that * instead. * * If a new ioend is created and cached, the old ioend is returned and queued @@ -1352,7 +1352,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (unlikely(error)) { /* * Let the filesystem know what portion of the current page - * failed to map. If the page wasn't been added to ioend, it + * failed to map. If the page hasn't been added to ioend, it * won't be affected by I/O completion and we must unlock it * now. */ @@ -1369,7 +1369,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, unlock_page(page); /* - * Preserve the original error if there was one, otherwise catch + * Preserve the original error if there was one; catch * submission errors here and propagate into subsequent ioend * submissions. */ @@ -1396,8 +1396,8 @@ done: /* * Write out a dirty page. * - * For delalloc space on the page we need to allocate space and flush it. 
- * For unwritten space on the page we need to start the conversion to + * For delalloc space on the page, we need to allocate space and flush it. + * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ static int @@ -1412,7 +1412,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* - * Refuse to write the page out if we are called from reclaim context. + * Refuse to write the page out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1457,20 +1457,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) unsigned offset_into_page = offset & (PAGE_SIZE - 1); /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the + * Skip the page if it's fully outside i_size, e.g. due to a + * truncate operation that's in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * iomap_vm_releasepage() is called on it and gets confused. * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's + * Note that the end_index is unsigned long. If the given + * offset is greater than 16TB on a 32-bit system then if we + * checked if the page is fully outside i_size with + * "if (page->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this page would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. Instead, we can detect this situation by + * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. */ if (page->index > end_index || From ab069d5fdcd14530d4223746c8d01f421d4c4057 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Aug 2021 20:07:33 -0700 Subject: [PATCH 06/41] iomap: Use kmap_local_page instead of kmap_atomic kmap_atomic() has the side-effect of disabling pagefaults and preemption. kmap_local_page() does not do this and is preferred. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/buffered-io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c1c8cd41ea81..8ee0211bea86 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -223,10 +223,10 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, if (poff > 0) iomap_page_create(inode, page); - addr = kmap_atomic(page) + poff; + addr = kmap_local_page(page) + poff; memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); - kunmap_atomic(addr); + kunmap_local(addr); iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); return PAGE_SIZE - poff; } @@ -682,9 +682,9 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page, BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); - addr = kmap_atomic(page); - memcpy(iomap_inline_data(iomap, pos), addr + pos, copied); - kunmap_atomic(addr); + addr = kmap_local_page(page) + pos; + memcpy(iomap_inline_data(iomap, pos), addr, copied); + kunmap_local(addr); mark_inode_dirty(inode); return copied; From ae44f9c286da3fbb3f827076403ea64fa9adfef2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Aug 2021 20:07:34 -0700 Subject: [PATCH 07/41] iomap: Add another assertion to inline data handling Check that the file tail does not cross a page boundary. Requested by Andreas. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8ee0211bea86..586d9d078ce1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -215,6 +215,8 @@ static int iomap_read_inline_data(struct inode *inode, struct page *page, if (PageUptodate(page)) return PAGE_SIZE - poff; + if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) + return -EIO; if (WARN_ON_ONCE(size > PAGE_SIZE - offset_in_page(iomap->inline_data))) return -EIO; From b69eea82d37d9ee7cfb3bf05103549dd4ed5ffc3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 10 Aug 2021 18:32:55 -0700 Subject: [PATCH 08/41] iomap: pass writeback errors to the mapping Modern-day mapping_set_error has the ability to squash the usual negative error code into something appropriate for long-term storage in a struct address_space -- ENOSPC becomes AS_ENOSPC, and everything else becomes EIO. iomap squashes /everything/ to EIO, just as XFS did before that, but this doesn't make sense. Fix this by making it so that we can pass ENOSPC to userspace when writeback fails due to space problems. Signed-off-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 586d9d078ce1..43b9354bac3a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1026,7 +1026,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, if (error) { SetPageError(page); - mapping_set_error(inode->i_mapping, -EIO); + mapping_set_error(inode->i_mapping, error); } WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); From d9d381f3ef5b2a4bee3e98d7b9f3b09cf00119c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:03 -0700 Subject: [PATCH 09/41] iomap: fix a trivial comment typo in trace.h Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index fdc7ae388476..e9cd5cc0d6ba 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -2,7 +2,7 @@ /* * Copyright (c) 2009-2019 Christoph Hellwig * - * NOTE: none of these tracepoints shall be consider a stable kernel ABI + * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. */ #undef TRACE_SYSTEM From 1d25d0aecfcd480b1a997a709c1b37e56ddc3c38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:03 -0700 Subject: [PATCH 10/41] iomap: remove the iomap arguments to ->page_{prepare,done} These aren't actually used by the only instance implementing the methods. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/gfs2/bmap.c | 5 ++--- fs/iomap/buffered-io.c | 6 +++--- include/linux/iomap.h | 5 ++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed8b67b21718..5414c2c33580 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1002,7 +1002,7 @@ static void gfs2_write_unlock(struct inode *inode) } static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len, struct iomap *iomap) + unsigned len) { unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1013,8 +1013,7 @@ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, } static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page, - struct iomap *iomap) + unsigned copied, struct page *page) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 43b9354bac3a..7e794a30806b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -615,7 +615,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, return -EINTR; if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(inode, pos, len, iomap); + status = page_ops->page_prepare(inode, pos, len); if (status) return status; } @@ -648,7 +648,7 @@ out_unlock: out_no_page: if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, 0, NULL, iomap); + page_ops->page_done(inode, pos, 0, NULL); return status; } @@ -724,7 +724,7 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, ret, page, iomap); + page_ops->page_done(inode, pos, ret, page); put_page(page); if (ret < len) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b8ec145b2975..72696a55c137 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -126,10 +126,9 @@ static inline bool iomap_inline_data_valid(struct iomap *iomap) * associated page could not be obtained. 
*/ struct iomap_page_ops { - int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len, - struct iomap *iomap); + int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, - struct page *page, struct iomap *iomap); + struct page *page); }; /* From 66b8165ed4b5a2e7ddb7b9bbf3586b7ccdd86a1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:04 -0700 Subject: [PATCH 11/41] iomap: mark the iomap argument to iomap_sector const Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 72696a55c137..8030483331d1 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -91,8 +91,7 @@ struct iomap { const struct iomap_page_ops *page_ops; }; -static inline sector_t -iomap_sector(struct iomap *iomap, loff_t pos) +static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } From 4495c33e4d302b8d3a9eb483c06b2687d27dab9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:04 -0700 Subject: [PATCH 12/41] iomap: mark the iomap argument to iomap_inline_data const Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8030483331d1..560247130357 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -99,7 +99,7 @@ static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) /* * Returns the inline data pointer for logical offset @pos. */ -static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos) +static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) { return iomap->inline_data + pos - iomap->offset; } From e3c4ffb0c2219e720acdc6072c6ddaccac5cab79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:05 -0700 Subject: [PATCH 13/41] iomap: mark the iomap argument to iomap_inline_data_valid const Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 560247130357..76bfc5d16ef4 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -109,7 +109,7 @@ static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) * This is used to guard against accessing data beyond the page inline_data * points at. */ -static inline bool iomap_inline_data_valid(struct iomap *iomap) +static inline bool iomap_inline_data_valid(const struct iomap *iomap) { return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); } From 6d49cc8545e9e9e9e5a14e75fd044f049bd6077e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:05 -0700 Subject: [PATCH 14/41] fs: mark the iomap argument to __block_write_begin_int const __block_write_begin_int never modifies the passed in iomap, so mark it const. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/buffer.c | 4 ++-- fs/internal.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6290c3afdba4..bd6a9e9fbd64 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1912,7 +1912,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, - struct iomap *iomap) + const struct iomap *iomap) { loff_t offset = block << inode->i_blkbits; @@ -1966,7 +1966,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap) + get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; diff --git a/fs/internal.h b/fs/internal.h index 82e8eb32ff3d..54c2928d39ec 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -48,8 +48,8 @@ static inline int emergency_thaw_bdev(struct super_block *sb) /* * buffer.c */ -extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap); +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c From 7e4f4b2d689d959b03cb07dfbdb97b9696cb1076 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:05 -0700 Subject: [PATCH 15/41] fsdax: mark the iomap argument to dax_iomap_sector as const Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index da41f9363568..4d63040fd71f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1005,7 +1005,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } From 78c64b00f842ac704d0612553dd124c31b4afceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:06 -0700 Subject: [PATCH 16/41] iomap: mark the iomap argument to iomap_read_inline_data const iomap_read_inline_data never modifies the passed in iomap, so mark it const. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7e794a30806b..b8a1ba3fb957 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -206,7 +206,7 @@ struct iomap_readpage_ctx { }; static int iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) + const struct iomap *iomap) { size_t size = i_size_read(inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); From 1acd9e9c015b389aa3201a977454efb92e36806c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:06 -0700 Subject: [PATCH 17/41] iomap: mark the iomap argument to iomap_read_page_sync const iomap_read_page_sync never modifies the passed in iomap, so mark it const. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index b8a1ba3fb957..0273aede8b1d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -531,7 +531,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) static int iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, - unsigned plen, struct iomap *iomap) + unsigned plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; From 740499c78408f75c4e76feac848177cb0d0ccf4f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:07 -0700 Subject: [PATCH 18/41] iomap: fix the iomap_readpage_actor return value for inline data The actor should never return a larger value than the length that was passed in. The current code handles this gracefully, but the upcoming iter model will be more picky. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0273aede8b1d..8418dffe8acf 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,7 +205,7 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static int iomap_read_inline_data(struct inode *inode, struct page *page, +static loff_t iomap_read_inline_data(struct inode *inode, struct page *page, const struct iomap *iomap) { size_t size = i_size_read(inode) - iomap->offset; @@ -253,7 +253,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, sector_t sector; if (iomap->type == IOMAP_INLINE) - return iomap_read_inline_data(inode, page, iomap); + return min(iomap_read_inline_data(inode, page, iomap), length); /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(inode, page); From f4b896c213f0752adc828ddc11bd55419ffab248 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:08 -0700 Subject: [PATCH 19/41] iomap: add the new iomap_iter model The iomap_iter struct provides a convenient way to package up and maintain all the arguments to the various mapping and operation functions. It is operated on using the iomap_iter() function that is called in a loop until the whole range has been processed. Compared to the existing iomap_apply() function this avoids an indirect call for each iteration. For now iomap_iter() calls back into the existing ->iomap_begin and ->iomap_end methods, but in the future this could be further optimized to avoid indirect calls entirely. Based on an earlier patch from Matthew Wilcox . Signed-off-by: Christoph Hellwig [djwong: add to apply.c to preserve git history of iomap loop control] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/iomap/apply.c | 74 ++++++++++++++++++++++++++++++++++++++++++- fs/iomap/trace.h | 37 +++++++++++++++++++++- include/linux/iomap.h | 56 ++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 2 deletions(-) diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c index 26ab6563181f..e82647aef7ea 100644 --- a/fs/iomap/apply.c +++ b/fs/iomap/apply.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -97,3 +97,75 @@ out: return written ?
written : ret; } + +static inline int iomap_iter_advance(struct iomap_iter *iter) +{ + /* handle the previous iteration (if any) */ + if (iter->iomap.length) { + if (iter->processed <= 0) + return iter->processed; + if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) + return -EIO; + iter->pos += iter->processed; + iter->len -= iter->processed; + if (!iter->len) + return 0; + } + + /* clear the state for the next iteration */ + iter->processed = 0; + memset(&iter->iomap, 0, sizeof(iter->iomap)); + memset(&iter->srcmap, 0, sizeof(iter->srcmap)); + return 1; +} + +static inline void iomap_iter_done(struct iomap_iter *iter) +{ + WARN_ON_ONCE(iter->iomap.offset > iter->pos); + WARN_ON_ONCE(iter->iomap.length == 0); + WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); + if (iter->srcmap.type != IOMAP_HOLE) + trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); +} + +/** + * iomap_iter - iterate over a range of a file + * @iter: iteration structure + * @ops: iomap ops provided by the file system + * + * Iterate over filesystem-provided space mappings for the provided file range. + * + * This function handles cleanup of resources acquired for iteration when the + * filesystem indicates there are no more space mappings, which means that this + * function must be called in a loop that continues as long as it returns a + * positive value. If 0 or a negative value is returned, the caller must not + * return to the loop body. Within a loop body, there are two ways to break out + * of the loop: leave @iter.processed unchanged, or set it to a negative + * errno. + */ +int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) +{ + int ret; + + if (iter->iomap.length && ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), + iter->processed > 0 ? iter->processed : 0, + iter->flags, &iter->iomap); + if (ret < 0 && !iter->processed) + return ret; + } + + trace_iomap_iter(iter, ops, _RET_IP_); + ret = iomap_iter_advance(iter); + if (ret <= 0) + return ret; + + ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, + &iter->iomap, &iter->srcmap); + if (ret < 0) + return ret; + iomap_iter_done(iter); + return 1; +}
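The calling convention this establishes is the same for every conversion that follows: initialize a struct iomap_iter, call iomap_iter() in a loop, and store the body's byte count (or a negative errno) in iter.processed. A minimal sketch of an operation built this way; the function names, the IOMAP_REPORT flag choice, and the "count allocated bytes" body are illustrative, not an existing kernel interface:

	/* body: runs once per mapping returned by ->iomap_begin */
	static loff_t count_allocated_iter(struct iomap_iter *iter, u64 *bytes)
	{
		if (iter->iomap.type != IOMAP_HOLE)
			*bytes += iomap_length(iter);
		/* consume the whole mapping; less would remap the rest */
		return iomap_length(iter);
	}

	static int count_allocated(struct inode *inode, loff_t pos, u64 len,
			u64 *bytes, const struct iomap_ops *ops)
	{
		struct iomap_iter iter = {
			.inode	= inode,
			.pos	= pos,
			.len	= len,
			.flags	= IOMAP_REPORT,
		};
		int ret;

		*bytes = 0;
		while ((ret = iomap_iter(&iter, ops)) > 0)
			iter.processed = count_allocated_iter(&iter, bytes);
		return ret;
	}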
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index e9cd5cc0d6ba..1012d7af6b68 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2009-2019 Christoph Hellwig + * Copyright (c) 2009-2021 Christoph Hellwig * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. @@ -140,6 +140,8 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_apply_dstmap); DEFINE_IOMAP_EVENT(iomap_apply_srcmap); +DEFINE_IOMAP_EVENT(iomap_iter_dstmap); +DEFINE_IOMAP_EVENT(iomap_iter_srcmap); TRACE_EVENT(iomap_apply, TP_PROTO(struct inode *inode, loff_t pos, loff_t length, @@ -179,6 +181,39 @@ TRACE_EVENT(iomap_apply, __entry->actor) ); +TRACE_EVENT(iomap_iter, + TP_PROTO(struct iomap_iter *iter, const void *ops, + unsigned long caller), + TP_ARGS(iter, ops, caller), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(loff_t, pos) + __field(loff_t, length) + __field(unsigned int, flags) + __field(const void *, ops) + __field(unsigned long, caller) + ), + TP_fast_assign( + __entry->dev = iter->inode->i_sb->s_dev; + __entry->ino = iter->inode->i_ino; + __entry->pos = iter->pos; + __entry->length = iomap_length(iter); + __entry->flags = iter->flags; + __entry->ops = ops; + __entry->caller = caller; + ), + TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) ops %ps caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pos, + __entry->length, + __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), + __entry->flags, + __entry->ops, + (void *)__entry->caller) +); + #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 76bfc5d16ef4..aac4176ea164 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -161,6 +161,62 @@ struct iomap_ops { ssize_t written, unsigned flags, struct iomap *iomap); }; +/** + * struct iomap_iter - Iterate through a range of a file + * @inode: Set at the start of the iteration and should not change. + * @pos: The current file position we are operating on. It is updated by + * calls to iomap_iter(). Treat as read-only in the body. + * @len: The remaining length of the file segment we're operating on. + * It is updated at the same time as @pos. + * @processed: The number of bytes processed by the body in the most recent + * iteration, or a negative errno. 0 causes the iteration to stop. + * @flags: Zero or more of the iomap_begin flags above. + * @iomap: Map describing the I/O iteration + * @srcmap: Source map for COW operations + */ +struct iomap_iter { + struct inode *inode; + loff_t pos; + u64 len; + s64 processed; + unsigned flags; + struct iomap iomap; + struct iomap srcmap; +}; + +int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + u64 end = iter->iomap.offset + iter->iomap.length; + + if (iter->srcmap.type != IOMAP_HOLE) + end = min(end, iter->srcmap.offset + iter->srcmap.length); + return min(iter->len, end - iter->pos); +} + +/** + * iomap_iter_srcmap - return the source map for the current iomap iteration + * @i: iteration structure + * + * Write operations on file systems with reflink support might require a + * source and a destination map. This function returns the source map + * for a given operation, which may or may not be identical to the destination + * map in &i->iomap. + */ +static inline struct iomap *iomap_iter_srcmap(struct iomap_iter *i) +{ + if (i->srcmap.type != IOMAP_HOLE) + return &i->srcmap; + return &i->iomap; +} + /* * Main iomap iterator function. */
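To make the iomap_length() clamping concrete, here is a worked example with illustrative numbers. Suppose the body is at iter.pos = 4096 with iter.len = 16384 bytes remaining, and ->iomap_begin returned a mapping with offset = 0 and length = 8192 (no source map, so srcmap.type == IOMAP_HOLE):

	end            = iomap.offset + iomap.length = 0 + 8192 = 8192
	iomap_length() = min(iter.len, end - iter.pos)
	               = min(16384, 8192 - 4096)
	               = 4096

The body may therefore process at most 4096 bytes in this iteration; the remaining 12288 bytes cause iomap_iter() to call ->iomap_begin again for the next mapping.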
From f6d480006cea3fa1188931fe9751255f13365c4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:08 -0700 Subject: [PATCH 20/41] iomap: switch readahead and readpage to use iomap_iter Switch the page cache read functions to use iomap_iter instead of iomap_apply. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 80 +++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8418dffe8acf..32d80350bb55 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -241,11 +241,12 @@ static inline bool iomap_block_needs_zeroing(struct inode *inode, pos >= i_size_read(inode); } -static loff_t -iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx, loff_t offset) { - struct iomap_readpage_ctx *ctx = data; + struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos + offset; + loff_t length = iomap_length(iter) - offset; struct page *page = ctx->cur_page; struct iomap_page *iop; loff_t orig_pos = pos; @@ -253,15 +254,16 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, sector_t sector; if (iomap->type == IOMAP_INLINE) - return min(iomap_read_inline_data(inode, page, iomap), length); + return min(iomap_read_inline_data(iter->inode, page, iomap), + length); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); - iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); + iop = iomap_page_create(iter->inode, page); + iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; - if (iomap_block_needs_zeroing(inode, iomap, pos)) { + if (iomap_block_needs_zeroing(iter->inode, iomap, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; @@ -313,23 +315,23 @@ done: int iomap_readpage(struct page *page, const struct iomap_ops *ops) { - struct iomap_readpage_ctx ctx = { .cur_page = page }; - struct inode *inode = page->mapping->host; - unsigned poff; - loff_t ret; + struct iomap_iter iter = { + .inode = page->mapping->host, + .pos = page_offset(page), + .len = PAGE_SIZE, + }; + struct iomap_readpage_ctx ctx = { + .cur_page = page, + }; + int ret; trace_iomap_readpage(page->mapping->host, 1); - for (poff = 0; poff < PAGE_SIZE; poff += ret) { - ret = iomap_apply(inode, page_offset(page) + poff, - PAGE_SIZE - poff, 0, ops, &ctx, - iomap_readpage_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - SetPageError(page); - break; - } - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_readpage_iter(&iter, &ctx, 0); + + if (ret < 0) + SetPageError(page); if (ctx.bio) { submit_bio(ctx.bio); @@ -348,15 +350,14 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static loff_t -iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readahead_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { - struct iomap_readpage_ctx *ctx = data; + loff_t length = iomap_length(iter); loff_t done, ret; for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(pos + done) == 0) { + if (ctx->cur_page &&
offset_in_page(iter->pos + done) == 0) { if (!ctx->cur_page_in_bio) unlock_page(ctx->cur_page); put_page(ctx->cur_page); @@ -366,8 +367,7 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } - ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap, srcmap); + ret = iomap_readpage_iter(iter, ctx, done); } return done; @@ -390,25 +390,19 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, */ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { - struct inode *inode = rac->mapping->host; - loff_t pos = readahead_pos(rac); - size_t length = readahead_length(rac); + struct iomap_iter iter = { + .inode = rac->mapping->host, + .pos = readahead_pos(rac), + .len = readahead_length(rac), + }; struct iomap_readpage_ctx ctx = { .rac = rac, }; - trace_iomap_readahead(inode, readahead_count(rac)); + trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); - while (length > 0) { - ssize_t ret = iomap_apply(inode, pos, length, 0, ops, - &ctx, iomap_readahead_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - break; - } - pos += ret; - length -= ret; - } + while (iomap_iter(&iter, ops) > 0) + iter.processed = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); From ce83a0251c6ec2152f3449484d22e87f467c4a66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:08 -0700 Subject: [PATCH 21/41] iomap: switch iomap_file_buffered_write to use iomap_iter Switch iomap_file_buffered_write to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 49 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 32d80350bb55..a151b3b49038 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -726,13 +726,14 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, return ret; } -static loff_t -iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - struct iov_iter *i = data; - long status = 0; + struct iomap *srcmap = iomap_iter_srcmap(iter); + struct iomap *iomap = &iter->iomap; + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; ssize_t written = 0; + long status = 0; do { struct page *page; @@ -758,18 +759,18 @@ again: break; } - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, - srcmap); + status = iomap_write_begin(iter->inode, pos, bytes, 0, &page, + iomap, srcmap); if (unlikely(status)) break; - if (mapping_writably_mapped(inode->i_mapping)) + if (mapping_writably_mapped(iter->inode->i_mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, - srcmap); + status = iomap_write_end(iter->inode, pos, bytes, copied, page, + iomap, srcmap); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -790,29 +791,29 @@ again: written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); return written ? 
written : status; } ssize_t -iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, const struct iomap_ops *ops) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, written = 0; + struct iomap_iter iter = { + .inode = iocb->ki_filp->f_mapping->host, + .pos = iocb->ki_pos, + .len = iov_iter_count(i), + .flags = IOMAP_WRITE, + }; + int ret; - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), - IOMAP_WRITE, ops, iter, iomap_write_actor); - if (ret <= 0) - break; - pos += ret; - written += ret; - } - - return written ? written : ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_write_iter(&iter, i); + if (iter.pos == iocb->ki_pos) + return ret; + return iter.pos - iocb->ki_pos; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); From 8fc274d1f4b4fe629da3b84b6e5a7ef08a91df49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:09 -0700 Subject: [PATCH 22/41] iomap: switch iomap_file_unshare to use iomap_iter Switch iomap_file_unshare to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index a151b3b49038..1a334cd0a36d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -817,10 +817,12 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -static loff_t -iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_unshare_iter(struct iomap_iter *iter) { + struct iomap *iomap = &iter->iomap; + struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); long status = 0; loff_t written = 0; @@ -836,12 +838,12 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct page *page; - status = iomap_write_begin(inode, pos, bytes, + status = iomap_write_begin(iter->inode, pos, bytes, IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); if (unlikely(status)) return status; - status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, + status = iomap_write_end(iter->inode, pos, bytes, bytes, page, iomap, srcmap); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -852,7 +854,7 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (length); return written; @@ -862,18 +864,17 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE, + }; + int ret; - while (len) { - ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, - iomap_unshare_actor); - if (ret <= 0) - return ret; - pos += ret; - len -= ret; - } - - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_unshare_iter(&iter); + return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); From 2aa3048e03d38d5358be2553d4b638c1a018498c Mon Sep 17 00:00:00 2001 
From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:09 -0700 Subject: [PATCH 23/41] iomap: switch iomap_zero_range to use iomap_iter Switch iomap_zero_range to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1a334cd0a36d..c6b86eb686f7 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -896,11 +896,12 @@ static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, - loff_t length, void *data, struct iomap *iomap, - struct iomap *srcmap) +static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - bool *did_zero = data; + struct iomap *iomap = &iter->iomap; + struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); loff_t written = 0; /* already zeroed? we're done. */ @@ -910,10 +911,11 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, do { s64 bytes; - if (IS_DAX(inode)) + if (IS_DAX(iter->inode)) bytes = dax_iomap_zero(pos, length, iomap); else - bytes = iomap_zero(inode, pos, length, iomap, srcmap); + bytes = iomap_zero(iter->inode, pos, length, iomap, + srcmap); if (bytes < 0) return bytes; @@ -931,19 +933,17 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_ZERO, + }; + int ret; - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_ZERO, - ops, did_zero, iomap_zero_range_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } - - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_zero_iter(&iter, did_zero); + return ret; } EXPORT_SYMBOL_GPL(iomap_zero_range); From 253564bafff31382b412839b0e1bb44c19c51172 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:09 -0700 Subject: [PATCH 24/41] iomap: switch iomap_page_mkwrite to use iomap_iter Switch iomap_page_mkwrite to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
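
How a filesystem consumes the converted helper is unchanged; for reference, a sketch of typical usage when extending a file (the myfs_* names are hypothetical, modelled on existing iomap consumers):

	/* zero from the old EOF to the new size on an extending truncate */
	static int myfs_zero_eof(struct inode *inode, loff_t newsize)
	{
		bool did_zero = false;

		return iomap_zero_range(inode, i_size_read(inode),
				newsize - i_size_read(inode), &did_zero,
				&myfs_iomap_ops);
	}
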
Wong --- fs/iomap/buffered-io.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c6b86eb686f7..59db1e30a666 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -961,15 +961,15 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t -iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter, + struct page *page) { - struct page *page = data; + loff_t length = iomap_length(iter); int ret; - if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, pos, length, NULL, iomap); + if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { + ret = __block_write_begin_int(page, iter->pos, length, NULL, + &iter->iomap); if (ret) return ret; block_commit_write(page, 0, length); @@ -983,29 +983,24 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { + struct iomap_iter iter = { + .inode = file_inode(vmf->vma->vm_file), + .flags = IOMAP_WRITE | IOMAP_FAULT, + }; struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); - unsigned long length; - loff_t offset; ssize_t ret; lock_page(page); - ret = page_mkwrite_check_truncate(page, inode); + ret = page_mkwrite_check_truncate(page, iter.inode); if (ret < 0) goto out_unlock; - length = ret; - - offset = page_offset(page); - while (length > 0) { - ret = iomap_apply(inode, offset, length, - IOMAP_WRITE | IOMAP_FAULT, ops, page, - iomap_page_mkwrite_actor); - if (unlikely(ret <= 0)) - goto out_unlock; - offset += ret; - length -= ret; - } + iter.pos = page_offset(page); + iter.len = ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_page_mkwrite_iter(&iter, page); + if (ret < 0) + goto out_unlock; wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: From a6d3d49587d10d23189675fce11b332a915081ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:10 -0700 Subject: [PATCH 25/41] iomap: switch __iomap_dio_rw to use iomap_iter Switch __iomap_dio_rw to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
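
For context, the fault-path wiring a filesystem needs around this helper is small; a sketch (myfs_* names are hypothetical, and real callers also take whatever locks and pagefault accounting they need around the call):

	static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
	{
		return iomap_page_mkwrite(vmf, &myfs_iomap_ops);
	}

	static const struct vm_operations_struct myfs_vm_ops = {
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,
		.page_mkwrite	= myfs_page_mkwrite,
	};
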
Wong --- fs/btrfs/inode.c | 5 +- fs/iomap/direct-io.c | 164 +++++++++++++++++++++--------------------- include/linux/iomap.h | 4 +- 3 files changed, 86 insertions(+), 87 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0117d867ecf8..3b0595e8bdd9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8194,9 +8194,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, +static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & @@ -8212,7 +8213,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; + struct btrfs_dio_data *dio_data = iter->iomap.private; struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 41ccbfc9dc82..4ecd255e0511 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. */ #include #include @@ -59,19 +59,17 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) } EXPORT_SYMBOL_GPL(iomap_dio_iopoll); -static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio, loff_t pos) +static void iomap_dio_submit_bio(const struct iomap_iter *iter, + struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); if (dio->iocb->ki_flags & IOCB_HIPRI) bio_set_polled(bio, dio->iocb); - dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io( - file_inode(dio->iocb->ki_filp), - iomap, bio, pos); + dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); else dio->submit.cookie = submit_bio(bio); } @@ -181,24 +179,23 @@ static void iomap_dio_bio_end_io(struct bio *bio) } } -static void -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, - unsigned len) +static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, + loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio_set_dev(bio, iter->iomap.bdev); + bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); } /* @@ -206,8 +203,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_FUA flag in the dio request. 
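 *
 * For example (a sketch of the intent, not the exact rule set): an
 * O_DSYNC overwrite of an already-allocated, non-shared extent on a
 * FUA-capable device can set REQ_FUA on its bios and skip the
 * separate cache flush at I/O completion, while a write that changes
 * the mapping cannot, so IOMAP_DIO_WRITE_FUA gets cleared and a
 * flush is issued instead.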
*/ -static inline unsigned int -iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, + const struct iomap *iomap, bool use_fua) { unsigned int opflags = REQ_SYNC | REQ_IDLE; @@ -229,13 +226,16 @@ iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) return opflags; } -static loff_t -iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { + const struct iomap *iomap = &iter->iomap; + struct inode *inode = iter->inode; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; @@ -286,7 +286,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos - pad, pad); + iomap_dio_zero(iter, dio, pos - pad, pad); } /* @@ -339,7 +339,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); @@ -355,7 +355,7 @@ zero_tail: /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } out: /* Undo iter limitation to current extent */ @@ -365,35 +365,38 @@ out: return ret; } -static loff_t -iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) +static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - length = iov_iter_zero(length, dio->submit.iter); + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); + dio->size += length; return length; } -static loff_t -iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, + struct iomap_dio *dio) { + const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; - void *inline_data = iomap_inline_data(iomap, pos); + void *inline_data = iomap_inline_data(iomap, iomi->pos); + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; size_t copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { - loff_t size = inode->i_size; + loff_t size = iomi->inode->i_size; if (pos > size) memset(iomap_inline_data(iomap, size), 0, pos - size); copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); + i_size_write(iomi->inode, pos + copied); + mark_inode_dirty(iomi->inode); } } else { copied = copy_to_iter(inline_data, length, iter); @@ -402,30 +405,27 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, return copied; } -static loff_t -iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t 
iomap_dio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - struct iomap_dio *dio = data; - - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) return -EIO; - return iomap_dio_hole_actor(length, dio); + return iomap_dio_hole_iter(iter, dio); case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) - return iomap_dio_hole_actor(length, dio); - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_hole_iter(iter, dio); + return iomap_dio_bio_iter(iter, dio); case IOMAP_MAPPED: - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_bio_iter(iter, dio); case IOMAP_INLINE: - return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + return iomap_dio_inline_iter(iter, dio); case IOMAP_DELALLOC: /* * DIO is not serialised against mmap() access at all, and so * if the page_mkwrite occurs between the writeback and the - * iomap_apply() call in the DIO path, then it will see the + * iomap_iter() call in the DIO path, then it will see the * DELALLOC block that the page-mkwrite allocated. */ pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", @@ -456,16 +456,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); - size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos; - loff_t end = iocb->ki_pos + count - 1, ret = 0; + struct iomap_iter iomi = { + .inode = inode, + .pos = iocb->ki_pos, + .len = iov_iter_count(iter), + .flags = IOMAP_DIRECT, + }; + loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); - unsigned int iomap_flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; - if (!count) + if (!iomi.len) return NULL; dio = kmalloc(sizeof(*dio), GFP_KERNEL); @@ -486,29 +489,30 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.last_queue = NULL; if (iov_iter_rw(iter) == READ) { - if (pos >= dio->i_size) + if (iomi.pos >= dio->i_size) goto out_free_dio; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, end)) { + if (filemap_range_needs_writeback(mapping, iomi.pos, + end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { - iomap_flags |= IOMAP_WRITE; + iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { + if (filemap_range_has_page(mapping, iomi.pos, end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } /* for data sync or sync, we need sync completion processing */ @@ -527,12 +531,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; - if (pos >= dio->i_size || pos + count > dio->i_size) + if (iomi.pos >= dio->i_size || + iomi.pos + iomi.len > dio->i_size) goto out_free_dio; - iomap_flags |= IOMAP_OVERWRITE_ONLY; + iomi.flags |= IOMAP_OVERWRITE_ONLY; } - ret = filemap_write_and_wait_range(mapping, pos, end); + ret = filemap_write_and_wait_range(mapping, iomi.pos, end); if (ret) goto out_free_dio; @@ -542,9 +547,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If this invalidation fails, let the caller fall 
back to * buffered I/O. */ - if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, - end >> PAGE_SHIFT)) { - trace_iomap_dio_invalidate_fail(inode, pos, count); + if (invalidate_inode_pages2_range(mapping, + iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { + trace_iomap_dio_invalidate_fail(inode, iomi.pos, + iomi.len); ret = -ENOTBLK; goto out_free_dio; } @@ -559,31 +565,23 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - do { - ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio, - iomap_dio_actor); - if (ret <= 0) { - /* magic error code to fall back to buffered I/O */ - if (ret == -ENOTBLK) { - wait_for_completion = true; - ret = 0; - } - break; - } - pos += ret; - - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) { - /* - * We only report that we've read data up to i_size. - * Revert iter to a state corresponding to that as - * some callers (such as splice code) rely on it. - */ - iov_iter_revert(iter, pos - dio->i_size); - break; - } - } while ((count = iov_iter_count(iter)) > 0); + while ((ret = iomap_iter(&iomi, ops)) > 0) + iomi.processed = iomap_dio_iter(&iomi, dio); blk_finish_plug(&plug); + /* + * We only report that we've read data up to i_size. + * Revert iter to a state corresponding to that as some callers (such + * as the splice code) rely on it. + */ + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) + iov_iter_revert(iter, iomi.pos - dio->i_size); + + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) { + wait_for_completion = true; + ret = 0; + } if (ret < 0) iomap_dio_set_error(dio, ret); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index aac4176ea164..66e04aedd2ca 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -322,8 +322,8 @@ int iomap_writepages(struct address_space *mapping, struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); - blk_qc_t (*submit_io)(struct inode *inode, struct iomap *iomap, - struct bio *bio, loff_t file_offset); + blk_qc_t (*submit_io)(const struct iomap_iter *iter, struct bio *bio, + loff_t file_offset); }; /* From 7892386d35715d14c469ec98b6deab037e2e2232 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:10 -0700 Subject: [PATCH 26/41] iomap: switch iomap_fiemap to use iomap_iter Rewrite the ->fiemap implementation based on iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/fiemap.c | 70 ++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index aab070df4a21..acad09a8c188 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. 
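 *
 * Typical filesystem wiring for the helper below, for reference (a
 * sketch; the myfs_* names are hypothetical):
 *
 *	static int myfs_fiemap(struct inode *inode,
 *			struct fiemap_extent_info *fi, u64 start, u64 len)
 *	{
 *		return iomap_fiemap(inode, fi, start, len, &myfs_iomap_ops);
 *	}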
*/ #include #include @@ -8,13 +8,8 @@ #include #include -struct fiemap_ctx { - struct fiemap_extent_info *fi; - struct iomap prev; -}; - static int iomap_to_fiemap(struct fiemap_extent_info *fi, - struct iomap *iomap, u32 flags) + const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: @@ -43,24 +38,22 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t -iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, + struct fiemap_extent_info *fi, struct iomap *prev) { - struct fiemap_ctx *ctx = data; - loff_t ret = length; + int ret; - if (iomap->type == IOMAP_HOLE) - return length; + if (iter->iomap.type == IOMAP_HOLE) + return iomap_length(iter); - ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); - ctx->prev = *iomap; + ret = iomap_to_fiemap(fi, prev, 0); + *prev = iter->iomap; switch (ret) { case 0: /* success */ - return length; + return iomap_length(iter); case 1: /* extent array full */ return 0; - default: + default: /* error */ return ret; } } @@ -68,38 +61,33 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { - struct fiemap_ctx ctx; - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = start, + .len = len, + .flags = IOMAP_REPORT, + }; + struct iomap prev = { + .type = IOMAP_HOLE, + }; + int ret; - memset(&ctx, 0, sizeof(ctx)); - ctx.fi = fi; - ctx.prev.type = IOMAP_HOLE; - - ret = fiemap_prep(inode, fi, start, &len, 0); + ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, - iomap_fiemap_actor); - /* inode with no (attribute) mapping will give ENOENT */ - if (ret == -ENOENT) - break; - if (ret < 0) - return ret; - if (ret == 0) - break; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_fiemap_iter(&iter, fi, &prev); - start += ret; - len -= ret; - } - - if (ctx.prev.type != IOMAP_HOLE) { - ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } + /* inode with no (attribute) mapping will give ENOENT */ + if (ret < 0 && ret != -ENOENT) + return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); From 6d8a1287a48909dbf542470aa2ca1ef7ceab3fc1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:11 -0700 Subject: [PATCH 27/41] iomap: switch iomap_bmap to use iomap_iter Rewrite the ->bmap implementation based on iomap_iter. Signed-off-by: Christoph Hellwig [djwong: restructure the loop to make its behavior a little clearer] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/iomap/fiemap.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index acad09a8c188..66cf267c68ae 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -92,37 +92,32 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); -static loff_t -iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) -{ - sector_t *bno = data, addr; - - if (iomap->type == IOMAP_MAPPED) { - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - *bno = addr; - } - return 0; -} - /* legacy ->bmap interface. 0 is the error return (!) */ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { - struct inode *inode = mapping->host; - loff_t pos = bno << inode->i_blkbits; - unsigned blocksize = i_blocksize(inode); + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)bno << mapping->host->i_blkbits, + .len = i_blocksize(mapping->host), + .flags = IOMAP_REPORT, + }; + const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; - ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno, - iomap_bmap_actor); + while ((ret = iomap_iter(&iter, ops)) > 0) { + if (iter.iomap.type == IOMAP_MAPPED) + bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; + /* leave iter.processed unset to abort loop */ + } if (ret) return 0; + return bno; } EXPORT_SYMBOL_GPL(iomap_bmap); From 40670d18e878160a170ba135c5d077471d7a9998 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:11 -0700 Subject: [PATCH 28/41] iomap: switch iomap_seek_hole to use iomap_iter Rewrite iomap_seek_hole to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/seek.c | 51 +++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index ce6fb810854f..fed8f9005f9e 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Red Hat, Inc. - * Copyright (c) 2018 Christoph Hellwig. + * Copyright (c) 2018-2021 Christoph Hellwig. 
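 *
 * Typical filesystem wiring for the two seek helpers converted in
 * this and the following patch (a sketch; myfs_* is hypothetical,
 * and real callers also take the inode locks they need):
 *
 *	static loff_t myfs_llseek(struct file *file, loff_t offset,
 *			int whence)
 *	{
 *		struct inode *inode = file_inode(file);
 *
 *		switch (whence) {
 *		case SEEK_HOLE:
 *			return iomap_seek_hole(inode, offset,
 *					&myfs_iomap_ops);
 *		case SEEK_DATA:
 *			return iomap_seek_data(inode, offset,
 *					&myfs_iomap_ops);
 *		}
 *		return generic_file_llseek(file, offset, whence);
 *	}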
*/ #include #include @@ -10,21 +10,20 @@ #include #include -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_HOLE); - if (offset == start + length) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_HOLE); + if (*hole_pos == iter->pos + length) return length; - fallthrough; + return 0; case IOMAP_HOLE: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; default: return length; @@ -32,26 +31,28 @@ iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, } loff_t -iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. */ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_hole_actor); - if (ret < 0) - return ret; - if (ret == 0) - break; - offset += ret; - } - - return offset; + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_hole_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found hole before EOF */ + return pos; + return size; } EXPORT_SYMBOL_GPL(iomap_seek_hole); From c4740bf1edad559c10b1d33c72e885b920bf6029 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:12 -0700 Subject: [PATCH 29/41] iomap: switch iomap_seek_data to use iomap_iter Rewrite iomap_seek_data to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/seek.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index fed8f9005f9e..a845c012b50c 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -56,47 +56,48 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_DATA); - if (offset < 0) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_DATA); + if (*hole_pos < 0) return length; - fallthrough; + return 0; default: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; } } loff_t -iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. */ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_data_actor); - if (ret < 0) - return ret; - if (ret == 0) - return offset; - offset += ret; - } - + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_data_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found data before EOF */ + return pos; /* We've reached the end of the file without finding data */ return -ENXIO; } From 3d99a1ce3854a6cee3217247ab6b2cca3985a7a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:12 -0700 Subject: [PATCH 30/41] iomap: switch iomap_swapfile_activate to use iomap_iter Switch iomap_swapfile_activate to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/swapfile.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 6250ca6a1f85..7069606eca85 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -88,13 +88,9 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
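 *
 * Typical filesystem wiring, for reference (a sketch; myfs_* is
 * hypothetical):
 *
 *	static int myfs_swap_activate(struct swap_info_struct *sis,
 *			struct file *swap_file, sector_t *span)
 *	{
 *		return iomap_swapfile_activate(sis, swap_file, span,
 *				&myfs_iomap_ops);
 *	}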
*/ -static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap, - struct iomap *srcmap) +static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, + struct iomap *iomap, struct iomap_swapfile_info *isi) { - struct iomap_swapfile_info *isi = data; - int error; - switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -125,12 +121,12 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, isi->iomap.length += iomap->length; } else { /* Otherwise, add the retained iomap and store this one. */ - error = iomap_swapfile_add_extent(isi); + int error = iomap_swapfile_add_extent(isi); if (error) return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return count; + return iomap_length(iter); } /* @@ -141,16 +137,19 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops) { + struct inode *inode = swap_file->f_mapping->host; + struct iomap_iter iter = { + .inode = inode, + .pos = 0, + .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE), + .flags = IOMAP_REPORT, + }; struct iomap_swapfile_info isi = { .sis = sis, .lowest_ppage = (sector_t)-1ULL, .file = swap_file, }; - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = 0; - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); - loff_t ret; + int ret; /* * Persist all file mapping metadata so that we won't have any @@ -160,15 +159,10 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, - ops, &isi, iomap_swapfile_activate_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + if (ret < 0) + return ret; if (isi.iomap.length) { ret = iomap_swapfile_add_extent(&isi); From ca289e0b95afa973d204c77a4ad5c37e06145fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:13 -0700 Subject: [PATCH 31/41] fsdax: switch dax_iomap_rw to use iomap_iter Switch the dax_iomap_rw implementation to use iomap_iter. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/dax.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 4d63040fd71f..51da45301350 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1103,20 +1103,21 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) return size; } -static loff_t -dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t dax_iomap_iter(const struct iomap_iter *iomi, + struct iov_iter *iter) { + const struct iomap *iomap = &iomi->iomap; + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; - struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; size_t xfer; int id; if (iov_iter_rw(iter) == READ) { - end = min(end, i_size_read(inode)); + end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; @@ -1133,7 +1134,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * written by write(2) is visible in mmap. 
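 *
 * For reference, a filesystem's read/write paths reach this code
 * roughly as follows (a sketch; myfs_* is hypothetical, and callers
 * must hold the inode locks that dax_iomap_rw() asserts):
 *
 *	static ssize_t myfs_dax_write_iter(struct kiocb *iocb,
 *			struct iov_iter *from)
 *	{
 *		return dax_iomap_rw(iocb, from, &myfs_iomap_ops);
 *	}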
*/ if (iomap->flags & IOMAP_F_NEW) { - invalidate_inode_pages2_range(inode->i_mapping, + invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } @@ -1209,31 +1210,29 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, done = 0; - unsigned flags = 0; + struct iomap_iter iomi = { + .inode = iocb->ki_filp->f_mapping->host, + .pos = iocb->ki_pos, + .len = iov_iter_count(iter), + }; + loff_t done = 0; + int ret; if (iov_iter_rw(iter) == WRITE) { - lockdep_assert_held_write(&inode->i_rwsem); - flags |= IOMAP_WRITE; + lockdep_assert_held_write(&iomi.inode->i_rwsem); + iomi.flags |= IOMAP_WRITE; } else { - lockdep_assert_held(&inode->i_rwsem); + lockdep_assert_held(&iomi.inode->i_rwsem); } if (iocb->ki_flags & IOCB_NOWAIT) - flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, dax_iomap_actor); - if (ret <= 0) - break; - pos += ret; - done += ret; - } + while ((ret = iomap_iter(&iomi, ops)) > 0) + iomi.processed = dax_iomap_iter(&iomi, iter); - iocb->ki_pos += done; + done = iomi.pos - iocb->ki_pos; + iocb->ki_pos = iomi.pos; return done ? done : ret; } EXPORT_SYMBOL_GPL(dax_iomap_rw); @@ -1307,7 +1306,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } /* - * Note that we don't bother to use iomap_apply here: DAX required + * Note that we don't bother to use iomap_iter here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ @@ -1561,7 +1560,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, } /* - * Note that we don't use iomap_apply here. We aren't doing I/O, only + * Note that we don't use iomap_iter here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. */ From 57320a01fe1ffb61c483f3734f62722f74263521 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:13 -0700 Subject: [PATCH 32/41] iomap: remove iomap_apply iomap_apply is unused now, so remove it. Signed-off-by: Christoph Hellwig [djwong: rebase this patch to preserve git history of iomap loop control] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/iomap/apply.c | 91 ------------------------------------------- fs/iomap/trace.h | 40 ------------------- include/linux/iomap.h | 10 ----- 3 files changed, 141 deletions(-) diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c index e82647aef7ea..a1c7592d2ade 100644 --- a/fs/iomap/apply.c +++ b/fs/iomap/apply.c @@ -3,101 +3,10 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ -#include -#include #include #include #include "trace.h" -/* - * Execute a iomap write on a segment of the mapping that spans a - * contiguous range of pages that have identical block mapping state. - * - * This avoids the need to map pages individually, do individual allocations - * for each page and most importantly avoid the need for filesystem specific - * locking per page. Instead, all the operations are amortised over the entire - * range of pages. 
It is assumed that the filesystems will lock whatever - * resources they require in the iomap_begin call, and release them in the - * iomap_end call. - */ -loff_t -iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - const struct iomap_ops *ops, void *data, iomap_actor_t actor) -{ - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - loff_t written = 0, ret; - u64 end; - - trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_); - - /* - * Need to map a range from start position for length bytes. This can - * span multiple pages - it is only guaranteed to return a range of a - * single type of pages (e.g. all into a hole, all mapped or all - * unwritten). Failure at this point has nothing to undo. - * - * If allocation is required for this range, reserve the space now so - * that the allocation is guaranteed to succeed later on. Once we copy - * the data into the page cache pages, then we cannot fail otherwise we - * expose transient stale data. If the reserve fails, we can safely - * back out at this point as there is nothing to undo. - */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); - if (ret) - return ret; - if (WARN_ON(iomap.offset > pos)) { - written = -EIO; - goto out; - } - if (WARN_ON(iomap.length == 0)) { - written = -EIO; - goto out; - } - - trace_iomap_apply_dstmap(inode, &iomap); - if (srcmap.type != IOMAP_HOLE) - trace_iomap_apply_srcmap(inode, &srcmap); - - /* - * Cut down the length to the one actually provided by the filesystem, - * as it might not be able to give us the whole size that we requested. - */ - end = iomap.offset + iomap.length; - if (srcmap.type != IOMAP_HOLE) - end = min(end, srcmap.offset + srcmap.length); - if (pos + length > end) - length = end - pos; - - /* - * Now that we have guaranteed that the space allocation will succeed, - * we can do the copy-in page by page without having to worry about - * failures exposing transient data. - * - * To support COW operations, we read in data for partially blocks from - * the srcmap if the file system filled it in. In that case we the - * length needs to be limited to the earlier of the ends of the iomaps. - * If the file system did not provide a srcmap we pass in the normal - * iomap into the actors so that they don't need to have special - * handling for the two cases. - */ - written = actor(inode, pos, length, data, &iomap, - srcmap.type != IOMAP_HOLE ? &srcmap : &iomap); - -out: - /* - * Now the data has been copied, commit the range we've copied. This - * should not fail unless the filesystem has had a fatal error. - */ - if (ops->iomap_end) { - ret = ops->iomap_end(inode, pos, length, - written > 0 ? written : 0, - flags, &iomap); - } - - return written ? 
written : ret; -} - static inline int iomap_iter_advance(struct iomap_iter *iter) { /* handle the previous iteration (if any) */ diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 1012d7af6b68..f1519f9a1403 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -138,49 +138,9 @@ DECLARE_EVENT_CLASS(iomap_class, DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) -DEFINE_IOMAP_EVENT(iomap_apply_dstmap); -DEFINE_IOMAP_EVENT(iomap_apply_srcmap); DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -TRACE_EVENT(iomap_apply, - TP_PROTO(struct inode *inode, loff_t pos, loff_t length, - unsigned int flags, const void *ops, void *actor, - unsigned long caller), - TP_ARGS(inode, pos, length, flags, ops, actor, caller), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(u64, ino) - __field(loff_t, pos) - __field(loff_t, length) - __field(unsigned int, flags) - __field(const void *, ops) - __field(void *, actor) - __field(unsigned long, caller) - ), - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = pos; - __entry->length = length; - __entry->flags = flags; - __entry->ops = ops; - __entry->actor = actor; - __entry->caller = caller; - ), - TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " - "ops %ps caller %pS actor %ps", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->pos, - __entry->length, - __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), - __entry->flags, - __entry->ops, - (void *)__entry->caller, - __entry->actor) -); - TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, unsigned long caller), diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 66e04aedd2ca..6784a8b64714 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -217,16 +217,6 @@ static inline struct iomap *iomap_iter_srcmap(struct iomap_iter *i) return &i->iomap; } -/* - * Main iomap iterator function. - */ -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap, struct iomap *srcmap); - -loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, const struct iomap_ops *ops, void *data, - iomap_actor_t actor); - ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); From 1b5c1e36dc0e0f15de9717e81508934cbc3daf15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:14 -0700 Subject: [PATCH 33/41] iomap: pass an iomap_iter to various buffered I/O helpers Pass the iomap_iter structure instead of individual parameters to various internal helpers for buffered I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
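
With iomap_apply gone, everything goes through the iomap_iter() machinery added earlier in the series. For orientation, a condensed sketch of its core, simplified from the new fs/iomap code (tracing and sanity checks trimmed; not a verbatim copy):

	static inline int iomap_iter_advance(struct iomap_iter *iter)
	{
		/* handle the previous iteration (if any) */
		if (iter->iomap.length) {
			/* a zero or negative ->processed ends the walk */
			if (iter->processed <= 0)
				return iter->processed;
			iter->pos += iter->processed;
			iter->len -= iter->processed;
			if (!iter->len)
				return 0;
		}

		/* clear the state for the next iteration */
		iter->processed = 0;
		memset(&iter->iomap, 0, sizeof(iter->iomap));
		memset(&iter->srcmap, 0, sizeof(iter->srcmap));
		return 1;
	}

	int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
	{
		int ret;

		/* finish the mapping returned by the previous round, if any */
		if (iter->iomap.length && ops->iomap_end) {
			ret = ops->iomap_end(iter->inode, iter->pos,
					iomap_length(iter),
					iter->processed > 0 ? iter->processed : 0,
					iter->flags, &iter->iomap);
			if (ret < 0 && !iter->processed)
				return ret;
		}

		ret = iomap_iter_advance(iter);
		if (ret <= 0)
			return ret;

		/* ask the filesystem for the next mapping */
		ret = ops->iomap_begin(iter->inode, iter->pos, iter->len,
				iter->flags, &iter->iomap, &iter->srcmap);
		if (ret < 0)
			return ret;
		return 1;
	}
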
Wong --- fs/iomap/buffered-io.c | 137 ++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 71 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 59db1e30a666..d6d1fd0208a9 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,10 +205,11 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static loff_t iomap_read_inline_data(struct inode *inode, struct page *page, - const struct iomap *iomap) +static loff_t iomap_read_inline_data(struct iomap_iter *iter, + struct page *page) { - size_t size = i_size_read(inode) - iomap->offset; + struct iomap *iomap = iomap_iter_srcmap(iter); + size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); void *addr; @@ -223,7 +224,7 @@ static loff_t iomap_read_inline_data(struct inode *inode, struct page *page, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (poff > 0) - iomap_page_create(inode, page); + iomap_page_create(iter->inode, page); addr = kmap_local_page(page) + poff; memcpy(addr, iomap->inline_data, size); @@ -233,12 +234,14 @@ static loff_t iomap_read_inline_data(struct inode *inode, struct page *page, return PAGE_SIZE - poff; } -static inline bool iomap_block_needs_zeroing(struct inode *inode, - struct iomap *iomap, loff_t pos) +static inline bool iomap_block_needs_zeroing(struct iomap_iter *iter, + loff_t pos) { - return iomap->type != IOMAP_MAPPED || - (iomap->flags & IOMAP_F_NEW) || - pos >= i_size_read(inode); + struct iomap *srcmap = iomap_iter_srcmap(iter); + + return srcmap->type != IOMAP_MAPPED || + (srcmap->flags & IOMAP_F_NEW) || + pos >= i_size_read(iter->inode); } static loff_t iomap_readpage_iter(struct iomap_iter *iter, @@ -254,8 +257,7 @@ static loff_t iomap_readpage_iter(struct iomap_iter *iter, sector_t sector; if (iomap->type == IOMAP_INLINE) - return min(iomap_read_inline_data(iter->inode, page, iomap), - length); + return min(iomap_read_inline_data(iter, page), length); /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(iter->inode, page); @@ -263,7 +265,7 @@ static loff_t iomap_readpage_iter(struct iomap_iter *iter, if (plen == 0) goto done; - if (iomap_block_needs_zeroing(iter->inode, iomap, pos)) { + if (iomap_block_needs_zeroing(iter, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; @@ -538,12 +540,12 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, return submit_bio_wait(&bio); } -static int -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, - struct page *page, struct iomap *srcmap) +static int __iomap_write_begin(struct iomap_iter *iter, loff_t pos, + unsigned len, int flags, struct page *page) { - struct iomap_page *iop = iomap_page_create(inode, page); - loff_t block_size = i_blocksize(inode); + struct iomap *srcmap = iomap_iter_srcmap(iter); + struct iomap_page *iop = iomap_page_create(iter->inode, page); + loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; @@ -553,7 +555,7 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, ClearPageError(page); do { - iomap_adjust_read_range(inode, iop, &block_start, + iomap_adjust_read_range(iter->inode, iop, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; @@ -563,7 +565,7 @@ __iomap_write_begin(struct inode 
*inode, loff_t pos, unsigned len, int flags, (to <= poff || to >= poff + plen)) continue; - if (iomap_block_needs_zeroing(inode, srcmap, block_start)) { + if (iomap_block_needs_zeroing(iter, block_start)) { if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); @@ -579,55 +581,54 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } -static int iomap_write_begin_inline(struct inode *inode, - struct page *page, struct iomap *srcmap) +static int iomap_write_begin_inline(struct iomap_iter *iter, + struct page *page) { int ret; /* needs more work for the tailpacking case; disable for now */ - if (WARN_ON_ONCE(srcmap->offset != 0)) + if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) return -EIO; - ret = iomap_read_inline_data(inode, page, srcmap); + ret = iomap_read_inline_data(iter, page); if (ret < 0) return ret; return 0; } -static int -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap, struct iomap *srcmap) +static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, unsigned len, + unsigned flags, struct page **pagep) { - const struct iomap_page_ops *page_ops = iomap->page_ops; + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + struct iomap *srcmap = iomap_iter_srcmap(iter); struct page *page; int status = 0; - BUG_ON(pos + len > iomap->offset + iomap->length); - if (srcmap != iomap) + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); + if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); if (fatal_signal_pending(current)) return -EINTR; if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(inode, pos, len); + status = page_ops->page_prepare(iter->inode, pos, len); if (status) return status; } - page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT, - AOP_FLAG_NOFS); + page = grab_cache_page_write_begin(iter->inode->i_mapping, + pos >> PAGE_SHIFT, AOP_FLAG_NOFS); if (!page) { status = -ENOMEM; goto out_no_page; } if (srcmap->type == IOMAP_INLINE) - status = iomap_write_begin_inline(inode, page, srcmap); - else if (iomap->flags & IOMAP_F_BUFFER_HEAD) + status = iomap_write_begin_inline(iter, page); + else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else - status = __iomap_write_begin(inode, pos, len, flags, page, - srcmap); + status = __iomap_write_begin(iter, pos, len, flags, page); if (unlikely(status)) goto out_unlock; @@ -638,11 +639,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, out_unlock: unlock_page(page); put_page(page); - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); out_no_page: if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, 0, NULL); + page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -669,9 +670,10 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return copied; } -static size_t iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, size_t copied) +static size_t iomap_write_end_inline(struct iomap_iter *iter, struct page *page, + loff_t pos, size_t copied) { + struct iomap *iomap = &iter->iomap; void *addr; WARN_ON_ONCE(!PageUptodate(page)); @@ -682,26 +684,26 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page, 
memcpy(iomap_inline_data(iomap, pos), addr, copied); kunmap_local(addr); - mark_inode_dirty(inode); + mark_inode_dirty(iter->inode); return copied; } /* Returns the number of bytes copied. May be 0. Cannot be an errno. */ -static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, - size_t copied, struct page *page, struct iomap *iomap, - struct iomap *srcmap) +static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + size_t copied, struct page *page) { - const struct iomap_page_ops *page_ops = iomap->page_ops; - loff_t old_size = inode->i_size; + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(inode, page, iomap, pos, copied); + ret = iomap_write_end_inline(iter, page, pos, copied); } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, - page, NULL); + ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, + copied, page, NULL); } else { - ret = __iomap_write_end(inode, pos, len, copied, page); + ret = __iomap_write_end(iter->inode, pos, len, copied, page); } /* @@ -710,26 +712,24 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, * preferably after I/O completion so that no stale data is exposed. */ if (pos + ret > old_size) { - i_size_write(inode, pos + ret); - iomap->flags |= IOMAP_F_SIZE_CHANGED; + i_size_write(iter->inode, pos + ret); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } unlock_page(page); if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); + pagecache_isize_extended(iter->inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, ret, page); + page_ops->page_done(iter->inode, pos, ret, page); put_page(page); if (ret < len) - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); return ret; } static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap *iomap = &iter->iomap; loff_t length = iomap_length(iter); loff_t pos = iter->pos; ssize_t written = 0; @@ -759,8 +759,7 @@ again: break; } - status = iomap_write_begin(iter->inode, pos, bytes, 0, &page, - iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, 0, &page); if (unlikely(status)) break; @@ -769,8 +768,7 @@ again: copied = copy_page_from_iter_atomic(page, offset, bytes, i); - status = iomap_write_end(iter->inode, pos, bytes, copied, page, - iomap, srcmap); + status = iomap_write_end(iter, pos, bytes, copied, page); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -838,13 +836,12 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct page *page; - status = iomap_write_begin(iter->inode, pos, bytes, - IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, + IOMAP_WRITE_F_UNSHARE, &page); if (unlikely(status)) return status; - status = iomap_write_end(iter->inode, pos, bytes, bytes, page, iomap, - srcmap); + status = iomap_write_end(iter, pos, bytes, bytes, page); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -878,22 +875,21 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, - 
struct iomap *iomap, struct iomap *srcmap) +static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) { struct page *page; int status; unsigned offset = offset_in_page(pos); unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, 0, &page); if (status) return status; zero_user(page, offset, bytes); mark_page_accessed(page); - return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); + return iomap_write_end(iter, pos, bytes, bytes, page); } static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) @@ -914,8 +910,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (IS_DAX(iter->inode)) bytes = dax_iomap_zero(pos, length, iomap); else - bytes = iomap_zero(iter->inode, pos, length, iomap, - srcmap); + bytes = __iomap_zero_iter(iter, pos, length); if (bytes < 0) return bytes; From b74b1293e6cae70bade491067f15b9d33e040cad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:14 -0700 Subject: [PATCH 34/41] iomap: rework unshare flag Instead of another internal flags namespace inside of buffered-io.c, just pass a UNSHARE hint in the main iomap flags field. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 23 +++++++++-------------- include/linux/iomap.h | 1 + 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d6d1fd0208a9..a0ef7ebe9209 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -508,10 +508,6 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, EXPORT_SYMBOL_GPL(iomap_migrate_page); #endif /* CONFIG_MIGRATION */ -enum { - IOMAP_WRITE_F_UNSHARE = (1 << 0), -}; - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -541,7 +537,7 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, } static int __iomap_write_begin(struct iomap_iter *iter, loff_t pos, - unsigned len, int flags, struct page *page) + unsigned len, struct page *page) { struct iomap *srcmap = iomap_iter_srcmap(iter); struct iomap_page *iop = iomap_page_create(iter->inode, page); @@ -560,13 +556,13 @@ static int __iomap_write_begin(struct iomap_iter *iter, loff_t pos, if (plen == 0) break; - if (!(flags & IOMAP_WRITE_F_UNSHARE) && + if (!(iter->flags & IOMAP_UNSHARE) && (from <= poff || from >= poff + plen) && (to <= poff || to >= poff + plen)) continue; if (iomap_block_needs_zeroing(iter, block_start)) { - if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) + if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); } else { @@ -596,7 +592,7 @@ static int iomap_write_begin_inline(struct iomap_iter *iter, } static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, unsigned len, - unsigned flags, struct page **pagep) + struct page **pagep) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -628,7 +624,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, unsigned len, else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else - status = __iomap_write_begin(iter, pos, len, flags, page); + status = __iomap_write_begin(iter, pos, len, page); if (unlikely(status)) goto out_unlock; @@ -759,7 
+755,7 @@ again: break; } - status = iomap_write_begin(iter, pos, bytes, 0, &page); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) break; @@ -836,8 +832,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct page *page; - status = iomap_write_begin(iter, pos, bytes, - IOMAP_WRITE_F_UNSHARE, &page); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) return status; @@ -865,7 +860,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, .inode = inode, .pos = pos, .len = len, - .flags = IOMAP_WRITE, + .flags = IOMAP_WRITE | IOMAP_UNSHARE, }; int ret; @@ -882,7 +877,7 @@ static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) unsigned offset = offset_in_page(pos); unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - status = iomap_write_begin(iter, pos, bytes, 0, &page); + status = iomap_write_begin(iter, pos, bytes, &page); if (status) return status; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6784a8b64714..f53c40e9d799 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -140,6 +140,7 @@ struct iomap_page_ops { #define IOMAP_DIRECT (1 << 4) /* direct I/O */ #define IOMAP_NOWAIT (1 << 5) /* do not block */ #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ +#define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ struct iomap_ops { /* From 55f81639a7152848f204f9af3f9b1a14a5944be1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Tue, 10 Aug 2021 18:33:14 -0700 Subject: [PATCH 35/41] fsdax: factor out helpers to simplify the dax fault code The dax page fault code is too long and a bit difficult to read, and it is hard to follow when trying to add new features. Some of the PTE and PMD paths share similar logic, so factor out helper functions to simplify the code. Signed-off-by: Shiyang Ruan Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong [hch: minor cleanups] Signed-off-by: Christoph Hellwig Signed-off-by: Darrick J. Wong --- fs/dax.c | 153 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 84 insertions(+), 69 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 51da45301350..c09d721629d1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1255,6 +1255,53 @@ static bool dax_fault_is_synchronous(unsigned long flags, && (iomap->flags & IOMAP_F_DIRTY); } +/* + * When handling a synchronous page fault and the inode needs an fsync, we can + * insert the PTE/PMD into page tables only after that fsync has happened. Skip + * insertion for now and return the pfn so that the caller can insert it after the + * fsync is done.
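+ *
+ * Sketch of the caller side this contract implies (modelled on how
+ * dax filesystems wire up their fault handlers; the myfs_* names are
+ * hypothetical):
+ *
+ *	static vm_fault_t myfs_huge_fault(struct vm_fault *vmf,
+ *			enum page_entry_size pe_size)
+ *	{
+ *		pfn_t pfn;
+ *		vm_fault_t ret;
+ *
+ *		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
+ *				&myfs_iomap_ops);
+ *		if (ret & VM_FAULT_NEEDDSYNC)
+ *			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ *		return ret;
+ *	}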
+ */ +static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) +{ + if (WARN_ON_ONCE(!pfnp)) + return VM_FAULT_SIGBUS; + *pfnp = pfn; + return VM_FAULT_NEEDDSYNC; +} + +static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap, + loff_t pos) +{ + sector_t sector = dax_iomap_sector(iomap, pos); + unsigned long vaddr = vmf->address; + vm_fault_t ret; + int error = 0; + + switch (iomap->type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_cow_page_dax(iomap->bdev, iomap->dax_dev, sector, + vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + return dax_fault_return(error); + + __SetPageUptodate(vmf->cow_page); + ret = finish_fault(vmf); + if (!ret) + return VM_FAULT_DONE_COW; + return ret; +} + static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { @@ -1323,30 +1370,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } if (vmf->cow_page) { - sector_t sector = dax_iomap_sector(&iomap, pos); - - switch (iomap.type) { - case IOMAP_HOLE: - case IOMAP_UNWRITTEN: - clear_user_highpage(vmf->cow_page, vaddr); - break; - case IOMAP_MAPPED: - error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, - sector, vmf->cow_page, vaddr); - break; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; - } - - if (error) - goto error_finish_iomap; - - __SetPageUptodate(vmf->cow_page); - ret = finish_fault(vmf); - if (!ret) - ret = VM_FAULT_DONE_COW; + ret = dax_fault_cow_page(vmf, &iomap, pos); goto finish_iomap; } @@ -1366,19 +1390,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, 0, write && !sync); - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PTE into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. - */ if (sync) { - if (WARN_ON_ONCE(!pfnp)) { - error = -EIO; - goto error_finish_iomap; - } - *pfnp = pfn; - ret = VM_FAULT_NEEDDSYNC | major; + ret = dax_fault_synchronous_pfnp(pfnp, pfn); goto finish_iomap; } trace_dax_insert_mapping(inode, vmf, entry); @@ -1477,13 +1490,45 @@ fallback: return VM_FAULT_FALLBACK; } +static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, + pgoff_t max_pgoff) +{ + unsigned long pmd_addr = vmf->address & PMD_MASK; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + /* + * Make sure that the faulting address's PMD offset (color) matches + * the PMD offset from the start of the file. This is necessary so + * that a PMD range in the page table overlaps exactly with a PMD + * range in the page cache. 
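+ * + * For example, assuming 4KiB pages and 2MiB PMDs, PG_PMD_COLOUR is 511: a + * fault on the page at file offset 0x201000 (colour 0x001) can only use a + * PMD mapping if the faulting virtual address has that same colour in its + * low nine page-number bits.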
+ */ + if ((vmf->pgoff & PG_PMD_COLOUR) != + ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) + return true; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vmf->vma->vm_flags & VM_SHARED)) + return true; + + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vmf->vma->vm_start) + return true; + if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return true; + + /* If the PMD would extend beyond the file size */ + if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) + return true; + + return false; +} + static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); - unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; @@ -1506,33 +1551,12 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); - /* - * Make sure that the faulting address's PMD offset (color) matches - * the PMD offset from the start of the file. This is necessary so - * that a PMD range in the page table overlaps exactly with a PMD - * range in the page cache. - */ - if ((vmf->pgoff & PG_PMD_COLOUR) != - ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) - goto fallback; - - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) - goto fallback; - - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) - goto fallback; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) - goto fallback; - if (xas.xa_index >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } - /* If the PMD would extend beyond the file size */ - if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) + if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* @@ -1584,17 +1608,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, DAX_PMD, write && !sync); - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PMD into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. - */ if (sync) { - if (WARN_ON_ONCE(!pfnp)) - goto finish_iomap; - *pfnp = pfn; - result = VM_FAULT_NEEDDSYNC; + result = dax_fault_synchronous_pfnp(pfnp, pfn); goto finish_iomap; } From c2436190e492b243235262fc080a2c3189021be9 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Tue, 10 Aug 2021 18:33:15 -0700 Subject: [PATCH 36/41] fsdax: factor out a dax_fault_actor() helper The core logic in the two dax page fault functions is similar. So, move the logic into a common helper function. Also, to facilitate the addition of new features, such as CoW, switch-case is no longer used to handle different iomap types. Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong --- fs/dax.c | 297 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 149 insertions(+), 148 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c09d721629d1..6d0c6d28be83 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1066,6 +1066,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } +#ifdef CONFIG_FS_DAX_PMD +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + pfn_t pfn; + + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); + + if (unlikely(!zero_page)) + goto fallback; + + pfn = page_to_pfn_t(zero_page); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { + spin_unlock(ptl); + goto fallback; + } + + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); + spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); + return VM_FAULT_NOPAGE; + +fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); + return VM_FAULT_FALLBACK; +} +#else +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) +{ + return VM_FAULT_FALLBACK; +} +#endif /* CONFIG_FS_DAX_PMD */ + s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); @@ -1302,6 +1362,63 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap, return ret; } +/** + * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault. + * @vmf: vm fault instance + * @pfnp: pfn to be returned + * @xas: the dax mapping tree of a file + * @entry: an unlocked dax entry to be inserted + * @pmd: distinguish whether it is a pmd fault + * @flags: iomap flags + * @iomap: from iomap_begin() + * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW + */ +static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp, + struct xa_state *xas, void **entry, bool pmd, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + size_t size = pmd ? PMD_SIZE : PAGE_SIZE; + loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap); + unsigned long entry_flags = pmd ? DAX_PMD : 0; + int err = 0; + pfn_t pfn; + + /* if we are reading UNWRITTEN and HOLE, return a hole. */ + if (!write && + (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { + if (!pmd) + return dax_load_hole(xas, mapping, entry, vmf); + return dax_pmd_load_hole(xas, vmf, iomap, entry); + } + + if (iomap->type != IOMAP_MAPPED) { + WARN_ON_ONCE(1); + return pmd ? 
VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; + } + + err = dax_iomap_pfn(iomap, pos, size, &pfn); + if (err) + return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); + + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, + write && !sync); + + if (sync) + return dax_fault_synchronous_pfnp(pfnp, pfn); + + /* insert PMD pfn */ + if (pmd) + return vmf_insert_pfn_pmd(vmf, pfn, write); + + /* insert PTE pfn */ + if (write) + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); +} + static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { @@ -1309,17 +1426,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, struct address_space *mapping = vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct inode *inode = mapping->host; - unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; unsigned flags = IOMAP_FAULT; - int error, major = 0; + int error; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; - vm_fault_t ret = 0; + vm_fault_t ret = 0, major = 0; void *entry; - pfn_t pfn; trace_dax_pte_fault(inode, vmf, ret); /* @@ -1365,8 +1479,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ - goto error_finish_iomap; + ret = VM_FAULT_SIGBUS; /* fs corruption? */ + goto finish_iomap; } if (vmf->cow_page) { @@ -1374,49 +1488,19 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; } - sync = dax_fault_is_synchronous(flags, vma, &iomap); - - switch (iomap.type) { - case IOMAP_MAPPED: - if (iomap.flags & IOMAP_F_NEW) { - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - } - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); - if (error < 0) - goto error_finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - 0, write && !sync); - - if (sync) { - ret = dax_fault_synchronous_pfnp(pfnp, pfn); - goto finish_iomap; - } - trace_dax_insert_mapping(inode, vmf, entry); - if (write) - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); - else - ret = vmf_insert_mixed(vma, vaddr, pfn); - + ret = dax_fault_actor(vmf, pfnp, &xas, &entry, false, flags, + &iomap, &srcmap); + if (ret == VM_FAULT_SIGBUS) goto finish_iomap; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (!write) { - ret = dax_load_hole(&xas, mapping, &entry, vmf); - goto finish_iomap; - } - fallthrough; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; + + /* read/write MAPPED, CoW UNWRITTEN */ + if (iomap.flags & IOMAP_F_NEW) { + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; } - error_finish_iomap: - ret = dax_fault_return(error); - finish_iomap: +finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1430,66 +1514,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - out: +out: trace_dax_pte_fault_done(inode, vmf, ret); return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - struct iomap 
*iomap, void **entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - unsigned long pmd_addr = vmf->address & PMD_MASK; - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = mapping->host; - pgtable_t pgtable = NULL; - struct page *zero_page; - spinlock_t *ptl; - pmd_t pmd_entry; - pfn_t pfn; - - zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); - - if (unlikely(!zero_page)) - goto fallback; - - pfn = page_to_pfn_t(zero_page); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_PMD | DAX_ZERO_PAGE, false); - - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - - ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); - if (!pmd_none(*(vmf->pmd))) { - spin_unlock(ptl); - goto fallback; - } - - if (pgtable) { - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - mm_inc_nr_ptes(vma->vm_mm); - } - pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); - spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); - return VM_FAULT_NOPAGE; - -fallback: - if (pgtable) - pte_free(vma->vm_mm, pgtable); - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); - return VM_FAULT_FALLBACK; -} - static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, pgoff_t max_pgoff) { @@ -1530,17 +1562,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct address_space *mapping = vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; - unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; + unsigned int flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; - vm_fault_t result = VM_FAULT_FALLBACK; + vm_fault_t ret = VM_FAULT_FALLBACK; struct iomap iomap = { .type = IOMAP_HOLE }; struct iomap srcmap = { .type = IOMAP_HOLE }; pgoff_t max_pgoff; void *entry; loff_t pos; int error; - pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1552,7 +1582,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { - result = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } @@ -1567,7 +1597,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { - result = xa_to_internal(entry); + ret = xa_to_internal(entry); goto fallback; } @@ -1579,7 +1609,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && !pmd_devmap(*vmf->pmd)) { - result = 0; + ret = 0; goto unlock_entry; } @@ -1589,49 +1619,21 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * to look up our filesystem block. 
*/ pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, - &srcmap); + error = ops->iomap_begin(inode, pos, PMD_SIZE, flags, &iomap, &srcmap); if (error) goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); + ret = dax_fault_actor(vmf, pfnp, &xas, &entry, true, flags, + &iomap, &srcmap); - switch (iomap.type) { - case IOMAP_MAPPED: - error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); - if (error < 0) - goto finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - DAX_PMD, write && !sync); - - if (sync) { - result = dax_fault_synchronous_pfnp(pfnp, pfn); - goto finish_iomap; - } - - trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vmf, pfn, write); - break; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (WARN_ON_ONCE(write)) - break; - result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); - break; - default: - WARN_ON_ONCE(1); - break; - } - - finish_iomap: +finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; - if (result == VM_FAULT_FALLBACK) + if (ret == VM_FAULT_FALLBACK) copied = 0; /* * The fault is done by now and there's no way back (other @@ -1639,19 +1641,18 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * Just ignore error from ->iomap_end since we cannot do much * with it. */ - ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, - &iomap); + ops->iomap_end(inode, pos, PMD_SIZE, copied, flags, &iomap); } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - fallback: - if (result == VM_FAULT_FALLBACK) { +fallback: + if (ret == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); - return result; + trace_dax_pmd_fault_done(inode, vmf, max_pgoff, ret); + return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, From 65dd814a6187ff46e33718d8eb76244e027837a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:15 -0700 Subject: [PATCH 37/41] fsdax: switch the fault handlers to use iomap_iter Avoid the open coded calls to ->iomap_begin and ->iomap_end and call iomap_iter instead. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/dax.c | 195 ++++++++++++++++++++++--------------------------------- 1 file changed, 76 insertions(+), 119 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 6d0c6d28be83..118c9e2923f5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1010,7 +1010,7 @@ static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } -static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, +static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, pfn_t *pfnp) { const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1068,7 +1068,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - struct iomap *iomap, void **entry) + const struct iomap *iomap, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1120,7 +1120,7 @@ fallback: } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - struct iomap *iomap, void **entry) + const struct iomap *iomap, void **entry) { return VM_FAULT_FALLBACK; } @@ -1309,7 +1309,7 @@ static vm_fault_t dax_fault_return(int error) * flushed on write-faults (non-cow), but not read-faults. */ static bool dax_fault_is_synchronous(unsigned long flags, - struct vm_area_struct *vma, struct iomap *iomap) + struct vm_area_struct *vma, const struct iomap *iomap) { return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && (iomap->flags & IOMAP_F_DIRTY); @@ -1329,22 +1329,22 @@ static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) return VM_FAULT_NEEDDSYNC; } -static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos) +static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, + const struct iomap_iter *iter) { - sector_t sector = dax_iomap_sector(iomap, pos); + sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos); unsigned long vaddr = vmf->address; vm_fault_t ret; int error = 0; - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: - error = copy_cow_page_dax(iomap->bdev, iomap->dax_dev, sector, - vmf->cow_page, vaddr); + error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev, + sector, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); @@ -1363,29 +1363,31 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, struct iomap *iomap, } /** - * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault. + * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. * @vmf: vm fault instance + * @iter: iomap iter * @pfnp: pfn to be returned * @xas: the dax mapping tree of a file * @entry: an unlocked dax entry to be inserted * @pmd: distinguish whether it is a pmd fault - * @flags: iomap flags - * @iomap: from iomap_begin() - * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW */ -static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp, - struct xa_state *xas, void **entry, bool pmd, - unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +static vm_fault_t dax_fault_iter(struct vm_fault *vmf, + const struct iomap_iter *iter, pfn_t *pfnp, + struct xa_state *xas, void **entry, bool pmd) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const struct iomap *iomap = &iter->iomap; size_t size = pmd ? 
PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap); + bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap); unsigned long entry_flags = pmd ? DAX_PMD : 0; int err = 0; pfn_t pfn; + if (!pmd && vmf->cow_page) + return dax_fault_cow_page(vmf, iter); + /* if we are reading UNWRITTEN and HOLE, return a hole. */ if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { @@ -1399,7 +1401,7 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp, return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } - err = dax_iomap_pfn(iomap, pos, size, &pfn); + err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn); if (err) return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); @@ -1422,32 +1424,31 @@ static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp, static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); - struct inode *inode = mapping->host; - loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - unsigned flags = IOMAP_FAULT; - int error; - bool write = vmf->flags & FAULT_FLAG_WRITE; - vm_fault_t ret = 0, major = 0; + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, + .len = PAGE_SIZE, + .flags = IOMAP_FAULT, + }; + vm_fault_t ret = 0; void *entry; + int error; - trace_dax_pte_fault(inode, vmf, ret); + trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) { + if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } - if (write && !vmf->cow_page) - flags |= IOMAP_WRITE; + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { @@ -1466,59 +1467,34 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto unlock_entry; } - /* - * Note that we don't bother to use iomap_iter here: DAX required - * the file system block size to be equal the page size, which means - * that we never have to deal with more than a single extent here. - */ - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); + while ((error = iomap_iter(&iter, ops)) > 0) { + if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { + iter.processed = -EIO; /* fs corruption? */ + continue; + } + + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); + if (ret != VM_FAULT_SIGBUS && + (iter.iomap.flags & IOMAP_F_NEW)) { + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + ret |= VM_FAULT_MAJOR; + } + + if (!(ret & VM_FAULT_ERROR)) + iter.processed = PAGE_SIZE; + } + if (iomap_errp) *iomap_errp = error; - if (error) { + if (!ret && error) ret = dax_fault_return(error); - goto unlock_entry; - } - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - ret = VM_FAULT_SIGBUS; /* fs corruption? 
*/ - goto finish_iomap; - } - if (vmf->cow_page) { - ret = dax_fault_cow_page(vmf, &iomap, pos); - goto finish_iomap; - } - - ret = dax_fault_actor(vmf, pfnp, &xas, &entry, false, flags, - &iomap, &srcmap); - if (ret == VM_FAULT_SIGBUS) - goto finish_iomap; - - /* read/write MAPPED, CoW UNWRITTEN */ - if (iomap.flags & IOMAP_F_NEW) { - count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - } - -finish_iomap: - if (ops->iomap_end) { - int copied = PAGE_SIZE; - - if (ret & VM_FAULT_ERROR) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PTE we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. - */ - ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); - } unlock_entry: dax_unlock_entry(&xas, entry); out: - trace_dax_pte_fault_done(inode, vmf, ret); - return ret | major; + trace_dax_pte_fault_done(iter.inode, vmf, ret); + return ret; } #ifdef CONFIG_FS_DAX_PMD @@ -1558,28 +1534,29 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); - bool write = vmf->flags & FAULT_FLAG_WRITE; - unsigned int flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; - struct inode *inode = mapping->host; + struct iomap_iter iter = { + .inode = mapping->host, + .len = PMD_SIZE, + .flags = IOMAP_FAULT, + }; vm_fault_t ret = VM_FAULT_FALLBACK; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; pgoff_t max_pgoff; void *entry; - loff_t pos; int error; + if (vmf->flags & FAULT_FLAG_WRITE) + iter.flags |= IOMAP_WRITE; + /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); - trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); + trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { ret = VM_FAULT_SIGBUS; @@ -1613,45 +1590,25 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, goto unlock_entry; } - /* - * Note that we don't use iomap_iter here. We aren't doing I/O, only - * setting up a mapping, so really we're using iomap_begin() as a way - * to look up our filesystem block. - */ - pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, flags, &iomap, &srcmap); - if (error) - goto unlock_entry; + iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; + while ((error = iomap_iter(&iter, ops)) > 0) { + if (iomap_length(&iter) < PMD_SIZE) + continue; /* actually breaks out of the loop */ - if (iomap.offset + iomap.length < pos + PMD_SIZE) - goto finish_iomap; - - ret = dax_fault_actor(vmf, pfnp, &xas, &entry, true, flags, - &iomap, &srcmap); - -finish_iomap: - if (ops->iomap_end) { - int copied = PMD_SIZE; - - if (ret == VM_FAULT_FALLBACK) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PMD we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. 
- */ - ops->iomap_end(inode, pos, PMD_SIZE, copied, flags, &iomap); + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); + if (ret != VM_FAULT_FALLBACK) + iter.processed = PMD_SIZE; } + unlock_entry: dax_unlock_entry(&xas, entry); fallback: if (ret == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, vmf->pmd, vmf->address); + split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vmf, max_pgoff, ret); + trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); return ret; } #else From fad0a1ab34f777bd8a95c6cebd70ee899b6e159e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Aug 2021 18:33:16 -0700 Subject: [PATCH 38/41] iomap: constify iomap_iter_srcmap The srcmap returned from iomap_iter_srcmap is never modified, so mark the iomap returned from it const and constify a lot of code that never modifies the iomap. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 38 +++++++++++++++++++------------------- include/linux/iomap.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index a0ef7ebe9209..9cc5798423d1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,10 +205,10 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static loff_t iomap_read_inline_data(struct iomap_iter *iter, +static loff_t iomap_read_inline_data(const struct iomap_iter *iter, struct page *page) { - struct iomap *iomap = iomap_iter_srcmap(iter); + const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); void *addr; @@ -234,20 +234,20 @@ static loff_t iomap_read_inline_data(struct iomap_iter *iter, return PAGE_SIZE - poff; } -static inline bool iomap_block_needs_zeroing(struct iomap_iter *iter, +static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, loff_t pos) { - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); return srcmap->type != IOMAP_MAPPED || (srcmap->flags & IOMAP_F_NEW) || pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(struct iomap_iter *iter, +static loff_t iomap_readpage_iter(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx, loff_t offset) { - struct iomap *iomap = &iter->iomap; + const struct iomap *iomap = &iter->iomap; loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; struct page *page = ctx->cur_page; @@ -352,7 +352,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static loff_t iomap_readahead_iter(struct iomap_iter *iter, +static loff_t iomap_readahead_iter(const struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { loff_t length = iomap_length(iter); @@ -536,10 +536,10 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, return submit_bio_wait(&bio); } -static int __iomap_write_begin(struct iomap_iter *iter, loff_t pos, +static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, unsigned len, struct page *page) { - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); struct iomap_page *iop = iomap_page_create(iter->inode, page); loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); @@ -577,7 +577,7 @@ static int 
__iomap_write_begin(struct iomap_iter *iter, loff_t pos, return 0; } -static int iomap_write_begin_inline(struct iomap_iter *iter, +static int iomap_write_begin_inline(const struct iomap_iter *iter, struct page *page) { int ret; @@ -591,11 +591,11 @@ static int iomap_write_begin_inline(struct iomap_iter *iter, return 0; } -static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, unsigned len, - struct page **pagep) +static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + unsigned len, struct page **pagep) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); struct page *page; int status = 0; @@ -666,10 +666,10 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return copied; } -static size_t iomap_write_end_inline(struct iomap_iter *iter, struct page *page, - loff_t pos, size_t copied) +static size_t iomap_write_end_inline(const struct iomap_iter *iter, + struct page *page, loff_t pos, size_t copied) { - struct iomap *iomap = &iter->iomap; + const struct iomap *iomap = &iter->iomap; void *addr; WARN_ON_ONCE(!PageUptodate(page)); @@ -689,7 +689,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, size_t copied, struct page *page) { const struct iomap_page_ops *page_ops = iter->iomap.page_ops; - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t old_size = iter->inode->i_size; size_t ret; @@ -814,7 +814,7 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); long status = 0; @@ -890,7 +890,7 @@ static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { struct iomap *iomap = &iter->iomap; - struct iomap *srcmap = iomap_iter_srcmap(iter); + const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); loff_t written = 0; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f53c40e9d799..24f8489583ca 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -211,7 +211,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) * for a given operation, which may or may not be identical to the destination * map in &i->iomap. */ -static inline struct iomap *iomap_iter_srcmap(struct iomap_iter *i) +static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) { if (i->srcmap.type != IOMAP_HOLE) return &i->srcmap; From 8d04fbe71fa06bb3671f449026178adfbf45dc74 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 11 Aug 2021 08:26:56 -0700 Subject: [PATCH 39/41] iomap: move loop control code to iter.c Now that we've moved iomap to the iterator model, rename this file to be in sync with the functions contained inside of it. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/iomap/Makefile | 2 +- fs/iomap/{apply.c => iter.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename fs/iomap/{apply.c => iter.c} (100%) diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index eef2722d93a1..4143a3ff89db 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - apply.o \ buffered-io.o \ direct-io.o \ fiemap.o \ + iter.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/apply.c b/fs/iomap/iter.c similarity index 100% rename from fs/iomap/apply.c rename to fs/iomap/iter.c From 36ca7943ac18aebf8aad4c50829eb2ea5ec847df Mon Sep 17 00:00:00 2001 From: Xu Yu Date: Wed, 18 Aug 2021 12:47:52 -0700 Subject: [PATCH 40/41] mm/swap: consider max pages in iomap_swapfile_add_extent When the max pages (last_page in the swap header + 1) is smaller than the total pages (inode size) of the swapfile, iomap_swapfile_activate overwrites sis->max with total pages. However, frontswap_map is a swap page state bitmap allocated using the initial sis->max page count read from the swap header. If swapfile activation increases sis->max, it's possible for the frontswap code to walk off the end of the bitmap, thereby corrupting kernel memory. [djwong: modify the description a bit; the original paragraph reads: "However, frontswap_map is allocated using max pages. When test and clear the sis offset, which is larger than max pages, of frontswap_map in __frontswap_invalidate_page(), neighbors of frontswap_map may be overwritten, i.e., slab is polluted." Note also that this bug resulted in a behavioral change: activating a swap file that was formatted and later extended results in all pages being activated, not the number of pages recorded in the swap header.] Fix the issue by clamping the number of pages added in iomap_swapfile_add_extent() to the maximum page count recorded in the swap info. To reproduce the issue, compile the kernel with SLUB red zone debugging enabled, then run this test: $ sudo stress-ng -a 1 -x softlockup,resources -t 72h --metrics --times \ --verify -v -Y /root/tmpdir/stress-ng/stress-statistic-12.yaml \ --log-file /root/tmpdir/stress-ng/stress-logfile-12.txt \ --temp-path /root/tmpdir/stress-ng/ The following error log appears: [ 1151.015141] ============================================================================= [ 1151.016489] BUG kmalloc-16 (Not tainted): Right Redzone overwritten [ 1151.017486] ----------------------------------------------------------------------------- [ 1151.017486] [ 1151.018997] Disabling lock debugging due to kernel taint [ 1151.019873] INFO: 0x0000000084e43932-0x0000000098d17cae @offset=7392. 
First byte 0x0 instead of 0xcc [ 1151.021303] INFO: Allocated in __do_sys_swapon+0xcf6/0x1170 age=43417 cpu=9 pid=3816 [ 1151.022538] __slab_alloc+0xe/0x20 [ 1151.023069] __kmalloc_node+0xfd/0x4b0 [ 1151.023704] __do_sys_swapon+0xcf6/0x1170 [ 1151.024346] do_syscall_64+0x33/0x40 [ 1151.024925] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.025749] INFO: Freed in put_cred_rcu+0xa1/0xc0 age=43424 cpu=3 pid=2041 [ 1151.026889] kfree+0x276/0x2b0 [ 1151.027405] put_cred_rcu+0xa1/0xc0 [ 1151.027949] rcu_do_batch+0x17d/0x410 [ 1151.028566] rcu_core+0x14e/0x2b0 [ 1151.029084] __do_softirq+0x101/0x29e [ 1151.029645] asm_call_irq_on_stack+0x12/0x20 [ 1151.030381] do_softirq_own_stack+0x37/0x40 [ 1151.031037] do_softirq.part.15+0x2b/0x30 [ 1151.031710] __local_bh_enable_ip+0x4b/0x50 [ 1151.032412] copy_fpstate_to_sigframe+0x111/0x360 [ 1151.033197] __setup_rt_frame+0xce/0x480 [ 1151.033809] arch_do_signal+0x1a3/0x250 [ 1151.034463] exit_to_user_mode_prepare+0xcf/0x110 [ 1151.035242] syscall_exit_to_user_mode+0x27/0x190 [ 1151.035970] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.036795] INFO: Slab 0x000000003b9de4dc objects=44 used=9 fp=0x00000000539e349e flags=0xfffffc0010201 [ 1151.038323] INFO: Object 0x000000004855ba01 @offset=7376 fp=0x0000000000000000 [ 1151.038323] [ 1151.039683] Redzone 000000008d0afd3d: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc ................ [ 1151.041180] Object 000000004855ba01: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ [ 1151.042714] Redzone 0000000084e43932: 00 00 00 c0 cc cc cc cc ........ [ 1151.044120] Padding 000000000864c042: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ [ 1151.045615] CPU: 5 PID: 3816 Comm: stress-ng Tainted: G B 5.10.50+ #7 [ 1151.046846] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 [ 1151.048633] Call Trace: [ 1151.049072] dump_stack+0x57/0x6a [ 1151.049585] check_bytes_and_report+0xed/0x110 [ 1151.050320] check_object+0x1eb/0x290 [ 1151.050924] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.051646] free_debug_processing+0x151/0x350 [ 1151.052333] __slab_free+0x21a/0x3a0 [ 1151.052938] ? _cond_resched+0x2d/0x40 [ 1151.053529] ? __vunmap+0x1de/0x220 [ 1151.054139] ? __x64_sys_swapoff+0x39a/0x540 [ 1151.054796] ? kfree+0x276/0x2b0 [ 1151.055307] kfree+0x276/0x2b0 [ 1151.055832] __x64_sys_swapoff+0x39a/0x540 [ 1151.056466] do_syscall_64+0x33/0x40 [ 1151.057084] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1151.057866] RIP: 0033:0x150340b0ffb7 [ 1151.058481] Code: Unable to access opcode bytes at RIP 0x150340b0ff8d. [ 1151.059537] RSP: 002b:00007fff7f4ee238 EFLAGS: 00000246 ORIG_RAX: 00000000000000a8 [ 1151.060768] RAX: ffffffffffffffda RBX: 00007fff7f4ee66c RCX: 0000150340b0ffb7 [ 1151.061904] RDX: 000000000000000a RSI: 0000000000018094 RDI: 00007fff7f4ee860 [ 1151.063033] RBP: 00007fff7f4ef980 R08: 0000000000000000 R09: 0000150340a672bd [ 1151.064135] R10: 00007fff7f4edca0 R11: 0000000000000246 R12: 0000000000018094 [ 1151.065253] R13: 0000000000000005 R14: 000000000160d930 R15: 00007fff7f4ee66c [ 1151.066413] FIX kmalloc-16: Restoring 0x0000000084e43932-0x0000000098d17cae=0xcc [ 1151.066413] [ 1151.067890] FIX kmalloc-16: Object at 0x000000004855ba01 not freed Fixes: 67482129cdab ("iomap: add a swapfile activation function") Fixes: a45c0eccc564 ("iomap: move the swapfile code into a separate file") Signed-off-by: Gang Deng Signed-off-by: Xu Yu Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/iomap/swapfile.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 7069606eca85..5fc0ac36dee3 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -31,11 +31,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) { struct iomap *iomap = &isi->iomap; unsigned long nr_pages; + unsigned long max_pages; uint64_t first_ppage; uint64_t first_ppage_reported; uint64_t next_ppage; int error; + if (unlikely(isi->nr_pages >= isi->sis->max)) + return 0; + max_pages = isi->sis->max - isi->nr_pages; + /* * Round the start up and the end down so that the physical * extent aligns to a page boundary. @@ -48,6 +53,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); /* * Calculate how much swap space we're adding; the first page contains From 03b8df8d43ecc3c5724e6bfb80bc0b9ea2aa2612 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 21 Aug 2021 19:24:53 -0700 Subject: [PATCH 41/41] iomap: standardize tracepoint formatting and storage Print all the offset, pos, and length quantities in hexadecimal. While we're at it, update the types of the tracepoint structure fields to match the types of the values being recorded in them. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/iomap/trace.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index f1519f9a1403..65e39785c284 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -4,6 +4,15 @@ * * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. + * + * Current conventions for printing numbers measuring specific units: + * + * offset: byte offset into a subcomponent of a file operation + * pos: file offset, in bytes + * length: length of a file operation, in bytes + * ino: inode number + * + * Numbers describing space allocations should be formatted in hexadecimal. 
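+ * + * For example, a 512-byte operation starting at file offset 4096 prints + * as "pos 0x1000 length 0x200".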
*/ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap @@ -42,14 +51,14 @@ DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), + TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) + __field(loff_t, offset) + __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; @@ -58,8 +67,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, __entry->offset = off; __entry->length = len; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " - "length %x", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -69,7 +77,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ + TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); @@ -122,8 +130,8 @@ DECLARE_EVENT_CLASS(iomap_class, __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), - TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " - "length %llu type %s flags %s", + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " + "length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -149,7 +157,7 @@ TRACE_EVENT(iomap_iter, __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) - __field(loff_t, length) + __field(u64, length) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -163,7 +171,7 @@ TRACE_EVENT(iomap_iter, __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos,