2019-05-24 18:04:05 +08:00
|
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2021-05-07 09:06:44 +08:00
|
|
|
|
/*
|
2005-12-16 06:31:24 +08:00
|
|
|
|
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include <linux/fs.h>
|
|
|
|
|
#include <linux/slab.h>
|
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
|
#include <asm/byteorder.h>
|
2007-02-10 12:24:12 +08:00
|
|
|
|
#include <linux/swap.h>
|
2007-10-31 03:08:32 +08:00
|
|
|
|
#include <linux/mpage.h>
|
2008-10-10 01:38:40 +08:00
|
|
|
|
#include <linux/quotaops.h>
|
2015-02-17 08:00:00 +08:00
|
|
|
|
#include <linux/blkdev.h>
|
2015-02-23 00:58:50 +08:00
|
|
|
|
#include <linux/uio.h>
|
2019-01-04 07:29:02 +08:00
|
|
|
|
#include <linux/mm.h>
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
|
|
#include <cluster/masklog.h>
|
|
|
|
|
|
|
|
|
|
#include "ocfs2.h"
|
|
|
|
|
|
|
|
|
|
#include "alloc.h"
|
|
|
|
|
#include "aops.h"
|
|
|
|
|
#include "dlmglue.h"
|
|
|
|
|
#include "extent_map.h"
|
|
|
|
|
#include "file.h"
|
|
|
|
|
#include "inode.h"
|
|
|
|
|
#include "journal.h"
|
2007-02-10 12:24:12 +08:00
|
|
|
|
#include "suballoc.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
|
#include "super.h"
|
|
|
|
|
#include "symlink.h"
|
2009-08-25 08:02:48 +08:00
|
|
|
|
#include "refcounttree.h"
|
2011-02-22 21:33:59 +08:00
|
|
|
|
#include "ocfs2_trace.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
|
|
#include "buffer_head_io.h"
|
2015-02-17 08:00:00 +08:00
|
|
|
|
#include "dir.h"
|
|
|
|
|
#include "namei.h"
|
|
|
|
|
#include "sysfile.h"
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
|
|
/*
 * get_block-style callback for symlinks whose target lives in allocated
 * blocks (fast symlinks are rejected with BUG_ON).
 *
 * Maps logical block @iblock of @inode onto @bh_result, using the start
 * block of the first extent record of the on-disk inode as the base.
 * @create is only forwarded to the tracepoint.
 *
 * For a freshly created inode whose @bh_result is not yet uptodate, the
 * symlink data is copied out of the buffer cache into the page backing
 * @bh_result, provided the buffer-cache block is still owned by the journal.
 *
 * Returns 0 on success.  Failures return -EIO, except that -ENOMEM is
 * returned when @iblock lies beyond the allocated clusters or when
 * sb_getblk() fails.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	trace_ocfs2_symlink_get_block(
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)iblock, bh_result, create);

	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	/* A symlink target can never extend past PATH_MAX bytes. */
	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		/* status is only logged; the caller sees err (-EIO). */
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		err = -ENOMEM;
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			err = -ENOMEM;
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				/* buffer_cache_bh is dropped at bail. */
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr);
			set_buffer_uptodate(bh_result);
		}
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	/*
	 * Single release point for both buffer references, so that no exit
	 * path can leak buffer_cache_bh.  Either pointer may still be NULL
	 * here.
	 */
	brelse(buffer_cache_bh);
	brelse(bh);

	return err;
}
|
|
|
|
|
|
2017-11-16 09:31:44 +08:00
|
|
|
|
/*
 * Same as ocfs2_get_block(), but performs the lookup while holding the
 * inode's ip_alloc_sem for reading.  Returns whatever ocfs2_get_block()
 * returns.
 */
static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int create)
{
	struct ocfs2_inode_info *info = OCFS2_I(inode);
	int status;

	down_read(&info->ip_alloc_sem);
	status = ocfs2_get_block(inode, iblock, bh_result, create);
	up_read(&info->ip_alloc_sem);

	return status;
}
|
|
|
|
|
|
2009-08-25 08:05:12 +08:00
|
|
|
|
/*
 * Map logical block @iblock of @inode to its physical block and record the
 * result in @bh_result.
 *
 * Symlinks are delegated to ocfs2_symlink_get_block().  For everything else
 * the extent map is consulted, and the mapping length written back to
 * bh_result->b_size is clamped to the size the caller passed in.  Holes and
 * unwritten extents are left unmapped.  When @create is set, blocks at or
 * beyond i_size are flagged with set_buffer_new().
 *
 * Returns 0 on success; any negative error is reported to the caller as
 * -EIO.
 */
int ocfs2_get_block(struct inode *inode, sector_t iblock,
		    struct buffer_head *bh_result, int create)
{
	int err = 0;
	unsigned int ext_flags;
	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
	u64 p_blkno, count, past_eof;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
			      (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
					  &ext_flags);
	if (err) {
		/*
		 * p_blkno is not guaranteed to have been written when the
		 * lookup fails, so it must not be read here.
		 */
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu)\n",
		     err, inode, (unsigned long long)iblock);
		goto bail;
	}

	if (max_blocks < count)
		count = max_blocks;

	/*
	 * ocfs2 never allocates in this function - the only time we
	 * need to use BH_New is when we're extending i_size on a file
	 * system which doesn't support holes, in which case BH_New
	 * allows __block_write_begin() to zero.
	 *
	 * If we see this on a sparse file system, then a truncate has
	 * raced us and removed the cluster. In this case, we clear
	 * the buffers dirty and uptodate bits and let the buffer code
	 * ignore it as a hole.
	 */
	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
		clear_buffer_dirty(bh_result);
		clear_buffer_uptodate(bh_result);
		goto bail;
	}

	/* Treat the unwritten extent as a hole for zeroing purposes. */
	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
		map_bh(bh_result, inode->i_sb, p_blkno);

	bh_result->b_size = count << inode->i_blkbits;

	if (!ocfs2_sparse_alloc(osb)) {
		/* Without sparse allocation a hole is an error. */
		if (p_blkno == 0) {
			err = -EIO;
			mlog(ML_ERROR,
			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
			     (unsigned long long)iblock,
			     (unsigned long long)p_blkno,
			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
			mlog(ML_ERROR, "Size %llu, clusters %u\n",
			     (unsigned long long)i_size_read(inode),
			     OCFS2_I(inode)->ip_clusters);
			dump_stack();
			goto bail;
		}
	}

	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));

	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
				  (unsigned long long)past_eof);
	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);

bail:
	/* Callers only ever see -EIO for a failure. */
	if (err < 0)
		err = -EIO;

	return err;
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
int ocfs2_read_inline_data(struct inode *inode, struct page *page,
|
|
|
|
|
struct buffer_head *di_bh)
|
2007-09-08 05:05:51 +08:00
|
|
|
|
{
|
|
|
|
|
void *kaddr;
|
2007-12-19 22:24:09 +08:00
|
|
|
|
loff_t size;
|
2007-09-08 05:05:51 +08:00
|
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
|
|
|
|
|
|
|
|
|
|
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
|
2015-09-05 06:44:51 +08:00
|
|
|
|
ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
|
2007-09-08 05:05:51 +08:00
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno);
|
|
|
|
|
return -EROFS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size = i_size_read(inode);
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
if (size > PAGE_SIZE ||
|
2009-03-05 11:06:15 +08:00
|
|
|
|
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
|
2007-09-08 05:05:51 +08:00
|
|
|
|
ocfs2_error(inode->i_sb,
|
2015-09-05 06:44:51 +08:00
|
|
|
|
"Inode %llu has with inline data has bad size: %Lu\n",
|
2007-12-19 22:24:09 +08:00
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno,
|
|
|
|
|
(unsigned long long)size);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
return -EROFS;
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-25 23:14:34 +08:00
|
|
|
|
kaddr = kmap_atomic(page);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
if (size)
|
|
|
|
|
memcpy(kaddr, di->id2.i_data.id_data, size);
|
|
|
|
|
/* Clear the remaining part of the page */
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
memset(kaddr + size, 0, PAGE_SIZE - size);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
flush_dcache_page(page);
|
2011-11-25 23:14:34 +08:00
|
|
|
|
kunmap_atomic(kaddr);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
|
|
|
|
|
SetPageUptodate(page);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
|
|
|
|
|
|
BUG_ON(!PageLocked(page));
|
2008-02-27 04:45:56 +08:00
|
|
|
|
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
|
2007-09-08 05:05:51 +08:00
|
|
|
|
|
2008-11-14 06:49:11 +08:00
|
|
|
|
ret = ocfs2_read_inode_block(inode, &di_bh);
|
2007-09-08 05:05:51 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = ocfs2_read_inline_data(inode, page, di_bh);
|
|
|
|
|
out:
|
|
|
|
|
unlock_page(page);
|
|
|
|
|
|
|
|
|
|
brelse(di_bh);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2022-04-29 23:12:16 +08:00
|
|
|
|
static int ocfs2_read_folio(struct file *file, struct folio *folio)
|
2005-12-16 06:31:24 +08:00
|
|
|
|
{
|
2022-04-29 23:12:16 +08:00
|
|
|
|
struct page *page = &folio->page;
|
2005-12-16 06:31:24 +08:00
|
|
|
|
struct inode *inode = page->mapping->host;
|
2007-09-08 05:05:51 +08:00
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
loff_t start = (loff_t)page->index << PAGE_SHIFT;
|
2005-12-16 06:31:24 +08:00
|
|
|
|
int ret, unlock = 1;
|
|
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
|
trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
|
|
|
|
|
(page ? page->index : 0));
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
2007-10-19 06:30:42 +08:00
|
|
|
|
ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
if (ret != 0) {
|
|
|
|
|
if (ret == AOP_TRUNCATED_PAGE)
|
|
|
|
|
unlock = 0;
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:05:51 +08:00
|
|
|
|
if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
|
2011-06-24 04:51:47 +08:00
|
|
|
|
/*
|
|
|
|
|
* Unlock the page and cycle ip_alloc_sem so that we don't
|
|
|
|
|
* busyloop waiting for ip_alloc_sem to unlock
|
|
|
|
|
*/
|
2007-05-15 02:38:51 +08:00
|
|
|
|
ret = AOP_TRUNCATED_PAGE;
|
2011-06-24 04:51:47 +08:00
|
|
|
|
unlock_page(page);
|
|
|
|
|
unlock = 0;
|
|
|
|
|
down_read(&oi->ip_alloc_sem);
|
|
|
|
|
up_read(&oi->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
|
goto out_inode_unlock;
|
2007-05-15 02:38:51 +08:00
|
|
|
|
}
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* i_size might have just been updated as we grabed the meta lock. We
|
|
|
|
|
* might now be discovering a truncate that hit on another node.
|
2022-04-29 22:40:40 +08:00
|
|
|
|
* block_read_full_folio->get_block freaks out if it is asked to read
|
2005-12-16 06:31:24 +08:00
|
|
|
|
* beyond the end of a file, so we check here. Callers
|
2007-07-19 16:46:59 +08:00
|
|
|
|
* (generic_file_read, vm_ops->fault) are clever enough to check i_size
|
2005-12-16 06:31:24 +08:00
|
|
|
|
* and notice that the page they just read isn't needed.
|
|
|
|
|
*
|
|
|
|
|
* XXX sys_readahead() seems to get that wrong?
|
|
|
|
|
*/
|
|
|
|
|
if (start >= i_size_read(inode)) {
|
2008-02-05 14:28:29 +08:00
|
|
|
|
zero_user(page, 0, PAGE_SIZE);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
SetPageUptodate(page);
|
|
|
|
|
ret = 0;
|
|
|
|
|
goto out_alloc;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:05:51 +08:00
|
|
|
|
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
|
|
|
|
ret = ocfs2_readpage_inline(inode, page);
|
|
|
|
|
else
|
2022-04-29 22:40:40 +08:00
|
|
|
|
ret = block_read_full_folio(page_folio(page), ocfs2_get_block);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
unlock = 0;
|
|
|
|
|
|
|
|
|
|
out_alloc:
|
2018-04-06 07:18:37 +08:00
|
|
|
|
up_read(&oi->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
|
out_inode_unlock:
|
|
|
|
|
ocfs2_inode_unlock(inode, 0);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
out:
|
|
|
|
|
if (unlock)
|
|
|
|
|
unlock_page(page);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-10-31 03:08:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* This is used only for read-ahead. Failures or difficult to handle
|
|
|
|
|
* situations are safe to ignore.
|
|
|
|
|
*
|
|
|
|
|
* Right now, we don't bother with BH_Boundary - in-inode extent lists
|
|
|
|
|
* are quite large (243 extents on 4k blocks), so most inodes don't
|
|
|
|
|
* grow out to a tree. If need be, detecting boundary extents could
|
|
|
|
|
* trivially be added in a future version of ocfs2_get_block().
|
|
|
|
|
*/
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
static void ocfs2_readahead(struct readahead_control *rac)
|
2007-10-31 03:08:32 +08:00
|
|
|
|
{
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
int ret;
|
|
|
|
|
struct inode *inode = rac->mapping->host;
|
2007-10-31 03:08:32 +08:00
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Use the nonblocking flag for the dlm code to avoid page
|
|
|
|
|
* lock inversion, but don't bother with retrying.
|
|
|
|
|
*/
|
|
|
|
|
ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
|
|
|
|
|
if (ret)
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
return;
|
2007-10-31 03:08:32 +08:00
|
|
|
|
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
if (down_read_trylock(&oi->ip_alloc_sem) == 0)
|
|
|
|
|
goto out_unlock;
|
2007-10-31 03:08:32 +08:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Don't bother with inline-data. There isn't anything
|
|
|
|
|
* to read-ahead in that case anyway...
|
|
|
|
|
*/
|
|
|
|
|
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
goto out_up;
|
2007-10-31 03:08:32 +08:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Check whether a remote node truncated this file - we just
|
|
|
|
|
* drop out in that case as it's not worth handling here.
|
|
|
|
|
*/
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
if (readahead_pos(rac) >= i_size_read(inode))
|
|
|
|
|
goto out_up;
|
2007-10-31 03:08:32 +08:00
|
|
|
|
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
mpage_readahead(rac, ocfs2_get_block);
|
2007-10-31 03:08:32 +08:00
|
|
|
|
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
out_up:
|
2007-10-31 03:08:32 +08:00
|
|
|
|
up_read(&oi->ip_alloc_sem);
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
out_unlock:
|
2007-10-31 03:08:32 +08:00
|
|
|
|
ocfs2_inode_unlock(inode, 0);
|
|
|
|
|
}
|
|
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
|
/* Note: Because we don't support holes, our allocation has
|
|
|
|
|
* already happened (allocation writes zeros to the file data)
|
|
|
|
|
* so we don't have to worry about ordered writes in
|
|
|
|
|
* ocfs2_writepage.
|
|
|
|
|
*
|
|
|
|
|
* ->writepage is called during the process of invalidating the page cache
|
|
|
|
|
* during blocked lock processing. It can't block on any cluster locks
|
|
|
|
|
* during block mapping. It's relying on the fact that the block
|
|
|
|
|
* mapping can't have disappeared under the dirty pages that it is
|
|
|
|
|
* being asked to write back.
|
|
|
|
|
*/
|
|
|
|
|
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
|
|
|
|
|
{
|
2011-02-22 21:33:59 +08:00
|
|
|
|
trace_ocfs2_writepage(
|
|
|
|
|
(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
|
|
|
|
|
page->index);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
|
return block_write_full_page(page, ocfs2_get_block, wbc);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Taken from ext3. We don't necessarily need the full blown
|
|
|
|
|
* functionality yet, but IMHO it's better to cut and paste the whole
|
|
|
|
|
* thing so we can avoid introducing our own bugs (and easily pick up
|
|
|
|
|
* their fixes when they happen) --Mark */
|
2007-02-17 03:46:50 +08:00
|
|
|
|
int walk_page_buffers( handle_t *handle,
|
|
|
|
|
struct buffer_head *head,
|
|
|
|
|
unsigned from,
|
|
|
|
|
unsigned to,
|
|
|
|
|
int *partial,
|
|
|
|
|
int (*fn)( handle_t *handle,
|
|
|
|
|
struct buffer_head *bh))
|
2005-12-16 06:31:24 +08:00
|
|
|
|
{
|
|
|
|
|
struct buffer_head *bh;
|
|
|
|
|
unsigned block_start, block_end;
|
|
|
|
|
unsigned blocksize = head->b_size;
|
|
|
|
|
int err, ret = 0;
|
|
|
|
|
struct buffer_head *next;
|
|
|
|
|
|
|
|
|
|
for ( bh = head, block_start = 0;
|
|
|
|
|
ret == 0 && (bh != head || !block_start);
|
|
|
|
|
block_start = block_end, bh = next)
|
|
|
|
|
{
|
|
|
|
|
next = bh->b_this_page;
|
|
|
|
|
block_end = block_start + blocksize;
|
|
|
|
|
if (block_end <= from || block_start >= to) {
|
|
|
|
|
if (partial && !buffer_uptodate(bh))
|
|
|
|
|
*partial = 1;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
err = (*fn)(handle, bh);
|
|
|
|
|
if (!ret)
|
|
|
|
|
ret = err;
|
|
|
|
|
}
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * ->bmap: translate logical block @block of the inode behind @mapping
 * into a physical block number.
 *
 * Returns the physical block number, or 0 on any failure (0 is what
 * bmap callers treat as an error). Inline-data inodes are never looked
 * up in the extent map and therefore also yield 0.
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	u64 p_blkno = 0;
	int err = 0;

	trace_ocfs2_bmap((unsigned long long)oi->ip_blkno,
			 (unsigned long long)block);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O.  We really can't allow
	 * that on refcounted inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error..
	 */
	if (ocfs2_is_refcount_inode(inode))
		return 0;

	/*
	 * We don't need to lock journal system files, since they aren't
	 * accessed concurrently from multiple nodes.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_inode_lock(inode, NULL, 0);
		if (err) {
			if (err != -ENOENT)
				mlog_errno(err);
			return 0;
		}
		down_read(&oi->ip_alloc_sem);
	}

	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL))
		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
						  NULL);

	/* Drop the locks in the reverse order they were taken. */
	if (!INODE_JOURNAL(inode)) {
		up_read(&oi->ip_alloc_sem);
		ocfs2_inode_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
		     (unsigned long long)block);
		mlog_errno(err);
		return 0;
	}

	return p_blkno;
}
|
|
|
|
|
|
2022-05-01 12:10:21 +08:00
|
|
|
|
/*
 * ->release_folio: try to drop the buffer heads attached to @folio.
 *
 * Returns false when the folio has no buffers; otherwise reports whether
 * try_to_free_buffers() succeeded. @wait is not consulted.
 */
static bool ocfs2_release_folio(struct folio *folio, gfp_t wait)
{
	return folio_buffers(folio) && try_to_free_buffers(&folio->page);
}
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
|
|
|
|
|
u32 cpos,
|
|
|
|
|
unsigned int *start,
|
|
|
|
|
unsigned int *end)
|
|
|
|
|
{
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
unsigned int cpp;
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
cluster_start = cpos % cpp;
|
|
|
|
|
cluster_start = cluster_start << osb->s_clustersize_bits;
|
|
|
|
|
|
|
|
|
|
cluster_end = cluster_start + osb->s_clustersize;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
BUG_ON(cluster_start > PAGE_SIZE);
|
|
|
|
|
BUG_ON(cluster_end > PAGE_SIZE);
|
|
|
|
|
|
|
|
|
|
if (start)
|
|
|
|
|
*start = cluster_start;
|
|
|
|
|
if (end)
|
|
|
|
|
*end = cluster_end;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* 'from' and 'to' are the region in the page to avoid zeroing.
|
|
|
|
|
*
|
|
|
|
|
* If pagesize > clustersize, this function will avoid zeroing outside
|
|
|
|
|
* of the cluster boundary.
|
|
|
|
|
*
|
|
|
|
|
* from == to == 0 is code for "zero the entire cluster region"
|
|
|
|
|
*/
|
|
|
|
|
static void ocfs2_clear_page_regions(struct page *page,
|
|
|
|
|
struct ocfs2_super *osb, u32 cpos,
|
|
|
|
|
unsigned from, unsigned to)
|
|
|
|
|
{
|
|
|
|
|
void *kaddr;
|
|
|
|
|
unsigned int cluster_start, cluster_end;
|
|
|
|
|
|
|
|
|
|
ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
|
|
|
|
|
|
2011-11-25 23:14:34 +08:00
|
|
|
|
kaddr = kmap_atomic(page);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
if (from || to) {
|
|
|
|
|
if (from > cluster_start)
|
|
|
|
|
memset(kaddr + cluster_start, 0, from - cluster_start);
|
|
|
|
|
if (to < cluster_end)
|
|
|
|
|
memset(kaddr + to, 0, cluster_end - to);
|
|
|
|
|
} else {
|
|
|
|
|
memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
|
|
|
|
|
}
|
|
|
|
|
|
2011-11-25 23:14:34 +08:00
|
|
|
|
kunmap_atomic(kaddr);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-11-02 02:37:48 +08:00
|
|
|
|
/*
|
|
|
|
|
* Nonsparse file systems fully allocate before we get to the write
|
|
|
|
|
* code. This prevents ocfs2_write() from tagging the write as an
|
|
|
|
|
* allocating one, which means ocfs2_map_page_blocks() might try to
|
|
|
|
|
* read-in the blocks at the tail of our file. Avoid reading them by
|
|
|
|
|
* testing i_size against each block offset.
|
|
|
|
|
*/
|
|
|
|
|
static int ocfs2_should_read_blk(struct inode *inode, struct page *page,
|
|
|
|
|
unsigned int block_start)
|
|
|
|
|
{
|
|
|
|
|
u64 offset = page_offset(page) + block_start;
|
|
|
|
|
|
|
|
|
|
if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
if (i_size_read(inode) > offset)
|
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
/*
|
2010-10-06 16:47:23 +08:00
|
|
|
|
* Some of this taken from __block_write_begin(). We already have our
|
2007-02-10 12:24:12 +08:00
|
|
|
|
* mapping by now though, and the entire write will be allocating or
|
|
|
|
|
* it won't, so not much need to use BH_New.
|
|
|
|
|
*
|
|
|
|
|
* This will also skip zeroing, which is handled externally.
|
|
|
|
|
*/
|
2007-02-17 03:46:50 +08:00
|
|
|
|
int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
|
|
|
|
|
struct inode *inode, unsigned int from,
|
|
|
|
|
unsigned int to, int new)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
|
|
|
|
|
unsigned int block_end, block_start;
|
2017-02-28 06:28:32 +08:00
|
|
|
|
unsigned int bsize = i_blocksize(inode);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
if (!page_has_buffers(page))
|
|
|
|
|
create_empty_buffers(page, bsize, 0);
|
|
|
|
|
|
|
|
|
|
head = page_buffers(page);
|
|
|
|
|
for (bh = head, block_start = 0; bh != head || !block_start;
|
|
|
|
|
bh = bh->b_this_page, block_start += bsize) {
|
|
|
|
|
block_end = block_start + bsize;
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
clear_buffer_new(bh);
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
/*
|
|
|
|
|
* Ignore blocks outside of our i/o range -
|
|
|
|
|
* they may belong to unallocated clusters.
|
|
|
|
|
*/
|
2007-02-17 03:46:50 +08:00
|
|
|
|
if (block_start >= to || block_end <= from) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (PageUptodate(page))
|
|
|
|
|
set_buffer_uptodate(bh);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* For an allocating write with cluster size >= page
|
|
|
|
|
* size, we always write the entire page.
|
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (new)
|
|
|
|
|
set_buffer_new(bh);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
if (!buffer_mapped(bh)) {
|
|
|
|
|
map_bh(bh, inode->i_sb, *p_blkno);
|
2016-11-05 01:08:15 +08:00
|
|
|
|
clean_bdev_bh_alias(bh);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (PageUptodate(page)) {
|
2021-06-29 10:34:08 +08:00
|
|
|
|
set_buffer_uptodate(bh);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
|
2007-06-19 02:12:36 +08:00
|
|
|
|
!buffer_new(bh) &&
|
2007-11-02 02:37:48 +08:00
|
|
|
|
ocfs2_should_read_blk(inode, page, block_start) &&
|
2007-06-19 02:12:36 +08:00
|
|
|
|
(block_start < from || block_end > to)) {
|
2016-06-06 03:31:44 +08:00
|
|
|
|
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
*wait_bh++=bh;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*p_blkno = *p_blkno + 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we issued read requests - let them complete.
|
|
|
|
|
*/
|
|
|
|
|
while(wait_bh > wait) {
|
|
|
|
|
wait_on_buffer(*--wait_bh);
|
|
|
|
|
if (!buffer_uptodate(*wait_bh))
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ret == 0 || !new)
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* If we get -EIO above, zero out any newly allocated blocks
|
|
|
|
|
* to avoid exposing stale data.
|
|
|
|
|
*/
|
|
|
|
|
bh = head;
|
|
|
|
|
block_start = 0;
|
|
|
|
|
do {
|
|
|
|
|
block_end = block_start + bsize;
|
|
|
|
|
if (block_end <= from)
|
|
|
|
|
goto next_bh;
|
|
|
|
|
if (block_start >= to)
|
|
|
|
|
break;
|
|
|
|
|
|
2008-02-05 14:28:29 +08:00
|
|
|
|
zero_user(page, block_start, bh->b_size);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
set_buffer_uptodate(bh);
|
|
|
|
|
mark_buffer_dirty(bh);
|
|
|
|
|
|
|
|
|
|
next_bh:
|
|
|
|
|
block_start = block_end;
|
|
|
|
|
bh = bh->b_this_page;
|
|
|
|
|
} while (bh != head);
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2016-04-01 20:29:48 +08:00
|
|
|
|
/*
 * Size of the w_pages[] array in struct ocfs2_write_ctxt: the number of
 * pages spanned by the largest cluster, or one page when a page is at
 * least as large as the largest cluster.
 */
#if (PAGE_SIZE < OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#else
#define OCFS2_MAX_CTXT_PAGES	1
#endif
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
/* PAGE_SIZE divided by the smallest cluster size; sizes the w_desc[] array of struct ocfs2_write_ctxt. */
#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
struct ocfs2_unwritten_extent {
|
|
|
|
|
struct list_head ue_node;
|
|
|
|
|
struct list_head ue_ip_node;
|
|
|
|
|
u32 ue_cpos;
|
|
|
|
|
u32 ue_phys;
|
|
|
|
|
};
|
|
|
|
|
|
2007-03-07 09:24:46 +08:00
|
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
|
* Describe the state of a single cluster to be written to.
|
2007-03-07 09:24:46 +08:00
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_write_cluster_desc {
|
|
|
|
|
u32 c_cpos;
|
|
|
|
|
u32 c_phys;
|
|
|
|
|
/*
|
|
|
|
|
* Give this a unique field because c_phys eventually gets
|
|
|
|
|
* filled.
|
|
|
|
|
*/
|
|
|
|
|
unsigned c_new;
|
2016-03-26 05:20:55 +08:00
|
|
|
|
unsigned c_clear_unwritten;
|
2009-08-07 07:12:58 +08:00
|
|
|
|
unsigned c_needs_zero;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
};
|
2007-03-07 09:24:46 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_write_ctxt {
|
|
|
|
|
/* Logical cluster position / len of write */
|
|
|
|
|
u32 w_cpos;
|
|
|
|
|
u32 w_clen;
|
2007-03-07 09:24:46 +08:00
|
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
|
/* First cluster allocated in a nonsparse extend */
|
|
|
|
|
u32 w_first_new_cpos;
|
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
|
/* Type of caller. Must be one of buffer, mmap, direct. */
|
|
|
|
|
ocfs2_write_type_t w_type;
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
|
2007-03-07 09:24:46 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* This is true if page_size > cluster_size.
|
|
|
|
|
*
|
|
|
|
|
* It triggers a set of special cases during write which might
|
|
|
|
|
* have to deal with allocating writes to partial pages.
|
|
|
|
|
*/
|
|
|
|
|
unsigned int w_large_pages;
|
2007-03-07 09:24:46 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* Pages involved in this write.
|
|
|
|
|
*
|
|
|
|
|
* w_target_page is the page being written to by the user.
|
|
|
|
|
*
|
|
|
|
|
* w_pages is an array of pages which always contains
|
|
|
|
|
* w_target_page, and in the case of an allocating write with
|
|
|
|
|
* page_size < cluster size, it will contain zero'd and mapped
|
|
|
|
|
* pages adjacent to w_target_page which need to be written
|
|
|
|
|
* out in so that future reads from that region will get
|
|
|
|
|
* zero's.
|
|
|
|
|
*/
|
|
|
|
|
unsigned int w_num_pages;
|
2010-06-11 06:21:36 +08:00
|
|
|
|
struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct page *w_target_page;
|
2007-06-07 07:15:24 +08:00
|
|
|
|
|
2011-07-25 01:36:54 +08:00
|
|
|
|
/*
|
|
|
|
|
* w_target_locked is used for page_mkwrite path indicating no unlocking
|
|
|
|
|
* against w_target_page in ocfs2_write_end_nolock.
|
|
|
|
|
*/
|
|
|
|
|
unsigned int w_target_locked:1;
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* ocfs2_write_end() uses this to know what the real range to
|
|
|
|
|
* write in the target should be.
|
|
|
|
|
*/
|
|
|
|
|
unsigned int w_target_from;
|
|
|
|
|
unsigned int w_target_to;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* We could use journal_current_handle() but this is cleaner,
|
|
|
|
|
* IMHO -Mark
|
|
|
|
|
*/
|
|
|
|
|
handle_t *w_handle;
|
|
|
|
|
|
|
|
|
|
struct buffer_head *w_di_bh;
|
2007-06-19 02:22:56 +08:00
|
|
|
|
|
|
|
|
|
struct ocfs2_cached_dealloc_ctxt w_dealloc;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
|
|
|
|
|
struct list_head w_unwritten_list;
|
2018-02-01 08:15:02 +08:00
|
|
|
|
unsigned int w_unwritten_count;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
};
|
|
|
|
|
|
2007-09-08 05:20:45 +08:00
|
|
|
|
/*
 * Unlock, mark accessed and drop one reference on each non-NULL entry
 * of @pages. NULL slots among the first @num_pages entries are skipped.
 */
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
{
	int idx;

	for (idx = 0; idx < num_pages; idx++) {
		struct page *page = pages[idx];

		if (!page)
			continue;

		unlock_page(page);
		mark_page_accessed(page);
		put_page(page);
	}
}
|
|
|
|
|
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
|
2007-09-08 05:20:45 +08:00
|
|
|
|
{
|
2011-07-25 01:36:54 +08:00
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* w_target_locked is only set to true in the page_mkwrite() case.
|
|
|
|
|
* The intent is to allow us to lock the target page from write_begin()
|
|
|
|
|
* to write_end(). The caller must hold a ref on w_target_page.
|
|
|
|
|
*/
|
|
|
|
|
if (wc->w_target_locked) {
|
|
|
|
|
BUG_ON(!wc->w_target_page);
|
|
|
|
|
for (i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
|
if (wc->w_target_page == wc->w_pages[i]) {
|
|
|
|
|
wc->w_pages[i] = NULL;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
mark_page_accessed(wc->w_target_page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
put_page(wc->w_target_page);
|
2011-07-25 01:36:54 +08:00
|
|
|
|
}
|
2007-09-08 05:20:45 +08:00
|
|
|
|
ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
}
|
2007-03-07 09:24:46 +08:00
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
static void ocfs2_free_unwritten_list(struct inode *inode,
|
|
|
|
|
struct list_head *head)
|
|
|
|
|
{
|
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
list_for_each_entry_safe(ue, tmp, head, ue_node) {
|
|
|
|
|
list_del(&ue->ue_node);
|
2016-03-26 05:21:06 +08:00
|
|
|
|
spin_lock(&oi->ip_lock);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
list_del(&ue->ue_ip_node);
|
2016-03-26 05:21:06 +08:00
|
|
|
|
spin_unlock(&oi->ip_lock);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
kfree(ue);
|
2016-03-26 05:21:06 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ocfs2_free_write_ctxt(struct inode *inode,
|
|
|
|
|
struct ocfs2_write_ctxt *wc)
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
{
|
2016-03-26 05:21:06 +08:00
|
|
|
|
ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are holding by write process, this
deadlock may hung the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
ocfs2_unlock_pages(wc);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
brelse(wc->w_di_bh);
|
|
|
|
|
kfree(wc);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Allocate and initialise a write context for a write of @len bytes
 * starting at byte offset @pos.
 *
 * The context records the first cluster touched (w_cpos) and the number of
 * clusters spanned (w_clen), takes its own reference on @di_bh, and starts
 * with an empty dealloc context and an empty unwritten-extent list.
 *
 * @wcp:   out parameter; receives the new context on success.
 * @osb:   super block info, used for the cluster size.
 * @pos:   starting byte offset of the write.
 * @len:   length of the write in bytes.
 * @type:  write type stored in the context.
 * @di_bh: buffer_head stored in the context (an extra reference is taken).
 *
 * Returns 0 on success or -ENOMEM if the context cannot be allocated.
 */
static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
				  struct ocfs2_super *osb, loff_t pos,
				  unsigned len, ocfs2_write_type_t type,
				  struct buffer_head *di_bh)
{
	struct ocfs2_write_ctxt *ctxt;
	u32 last_cluster;

	ctxt = kzalloc(sizeof(*ctxt), GFP_NOFS);
	if (!ctxt)
		return -ENOMEM;

	/* Cluster range covered by [pos, pos + len). */
	ctxt->w_cpos = pos >> osb->s_clustersize_bits;
	last_cluster = (pos + len - 1) >> osb->s_clustersize_bits;
	ctxt->w_clen = last_cluster - ctxt->w_cpos + 1;
	ctxt->w_first_new_cpos = UINT_MAX;

	/* The context holds its own reference on the buffer_head. */
	get_bh(di_bh);
	ctxt->w_di_bh = di_bh;
	ctxt->w_type = type;

	/* Flag the case where a single page is larger than a cluster. */
	ctxt->w_large_pages =
		unlikely(PAGE_SHIFT > osb->s_clustersize_bits) ? 1 : 0;

	ocfs2_init_dealloc_ctxt(&ctxt->w_dealloc);
	INIT_LIST_HEAD(&ctxt->w_unwritten_list);

	*wcp = ctxt;
	return 0;
}
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
|
* If a page has any new buffers, zero them out here, and mark them uptodate
|
|
|
|
|
* and dirty so they'll be written out (in order to prevent uninitialised
|
|
|
|
|
* block data from leaking). And clear the new bit.
|
2007-02-10 12:24:12 +08:00
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
|
unsigned int block_start, block_end;
|
|
|
|
|
struct buffer_head *head, *bh;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
BUG_ON(!PageLocked(page));
|
|
|
|
|
if (!page_has_buffers(page))
|
|
|
|
|
return;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
bh = head = page_buffers(page);
|
|
|
|
|
block_start = 0;
|
|
|
|
|
do {
|
|
|
|
|
block_end = block_start + bh->b_size;
|
|
|
|
|
|
|
|
|
|
if (buffer_new(bh)) {
|
|
|
|
|
if (block_end > from && block_start < to) {
|
|
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
|
unsigned start, end;
|
|
|
|
|
|
|
|
|
|
start = max(from, block_start);
|
|
|
|
|
end = min(to, block_end);
|
|
|
|
|
|
2008-02-05 14:28:29 +08:00
|
|
|
|
zero_user_segment(page, start, end);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
set_buffer_uptodate(bh);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
clear_buffer_new(bh);
|
|
|
|
|
mark_buffer_dirty(bh);
|
|
|
|
|
}
|
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
block_start = block_end;
|
|
|
|
|
bh = bh->b_this_page;
|
|
|
|
|
} while (bh != head);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Only called when we have a failure during allocating write to write
|
|
|
|
|
* zero's to the newly allocated region.
|
|
|
|
|
*/
|
|
|
|
|
static void ocfs2_write_failure(struct inode *inode,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
|
loff_t user_pos, unsigned user_len)
|
|
|
|
|
{
|
|
|
|
|
int i;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
unsigned from = user_pos & (PAGE_SIZE - 1),
|
2007-09-19 08:49:29 +08:00
|
|
|
|
to = user_pos + user_len;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct page *tmppage;
|
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (wc->w_target_page)
|
|
|
|
|
ocfs2_zero_new_buffers(wc->w_target_page, from, to);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
|
tmppage = wc->w_pages[i];
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (tmppage && page_has_buffers(tmppage)) {
|
2008-11-19 08:53:43 +08:00
|
|
|
|
if (ocfs2_should_order_data(inode))
|
2019-09-24 06:33:08 +08:00
|
|
|
|
ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
|
|
|
|
|
user_pos, user_len);
|
2008-07-17 08:22:22 +08:00
|
|
|
|
|
|
|
|
|
block_commit_write(tmppage, from, to);
|
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
|
struct page *page, u32 cpos,
|
|
|
|
|
loff_t user_pos, unsigned user_len,
|
|
|
|
|
int new)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
|
int ret;
|
|
|
|
|
unsigned int map_from = 0, map_to = 0;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
unsigned int cluster_start, cluster_end;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
unsigned int user_data_from = 0, user_data_to = 0;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
|
2007-02-10 12:24:12 +08:00
|
|
|
|
&cluster_start, &cluster_end);
|
|
|
|
|
|
2011-02-17 23:44:40 +08:00
|
|
|
|
/* treat the write as new if the a hole/lseek spanned across
|
|
|
|
|
* the page boundary.
|
|
|
|
|
*/
|
|
|
|
|
new = new | ((i_size_read(inode) <= page_offset(page)) &&
|
|
|
|
|
(page_offset(page) <= user_pos));
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (page == wc->w_target_page) {
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
map_from = user_pos & (PAGE_SIZE - 1);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
map_to = map_from + user_len;
|
|
|
|
|
|
|
|
|
|
if (new)
|
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
|
|
|
|
cluster_start, cluster_end,
|
|
|
|
|
new);
|
|
|
|
|
else
|
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
|
|
|
|
map_from, map_to, new);
|
|
|
|
|
if (ret) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
user_data_from = map_from;
|
|
|
|
|
user_data_to = map_to;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
map_from = cluster_start;
|
|
|
|
|
map_to = cluster_end;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* If we haven't allocated the new page yet, we
|
|
|
|
|
* shouldn't be writing it out without copying user
|
|
|
|
|
* data. This is likely a math error from the caller.
|
|
|
|
|
*/
|
|
|
|
|
BUG_ON(!new);
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
map_from = cluster_start;
|
|
|
|
|
map_to = cluster_end;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
|
2007-05-09 08:47:32 +08:00
|
|
|
|
cluster_start, cluster_end, new);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Parts of newly allocated pages need to be zero'd.
|
|
|
|
|
*
|
|
|
|
|
* Above, we have also rewritten 'to' and 'from' - as far as
|
|
|
|
|
* the rest of the function is concerned, the entire cluster
|
|
|
|
|
* range inside of a page needs to be written.
|
|
|
|
|
*
|
|
|
|
|
* We can skip this if the page is up to date - it's already
|
|
|
|
|
* been zero'd from being read in as a hole.
|
|
|
|
|
*/
|
|
|
|
|
if (new && !PageUptodate(page))
|
|
|
|
|
ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
|
2007-05-09 08:47:32 +08:00
|
|
|
|
cpos, user_data_from, user_data_to);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
flush_dcache_page(page);
|
|
|
|
|
|
|
|
|
|
out:
|
2007-05-09 08:47:32 +08:00
|
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
2007-05-09 08:47:32 +08:00
|
|
|
|
* This function will only grab one clusters worth of pages.
|
2007-02-10 12:24:12 +08:00
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
2010-07-03 08:20:27 +08:00
|
|
|
|
u32 cpos, loff_t user_pos,
|
|
|
|
|
unsigned user_len, int new,
|
2007-05-10 06:16:19 +08:00
|
|
|
|
struct page *mmap_page)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
|
int ret = 0, i;
|
2010-07-03 08:20:27 +08:00
|
|
|
|
unsigned long start, target_index, end_index, index;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
struct inode *inode = mapping->host;
|
2010-07-03 08:20:27 +08:00
|
|
|
|
loff_t last_byte;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
target_index = user_pos >> PAGE_SHIFT;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Figure out how many pages we'll be manipulating here. For
|
2007-02-17 03:46:50 +08:00
|
|
|
|
* non allocating write, we just change the one
|
2010-07-03 08:20:27 +08:00
|
|
|
|
* page. Otherwise, we'll need a whole clusters worth. If we're
|
|
|
|
|
* writing past i_size, we only need enough pages to cover the
|
|
|
|
|
* last page of the write.
|
2007-02-10 12:24:12 +08:00
|
|
|
|
*/
|
|
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
|
|
|
|
|
start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
|
2010-07-03 08:20:27 +08:00
|
|
|
|
/*
|
|
|
|
|
* We need the index *past* the last page we could possibly
|
|
|
|
|
* touch. This is the page past the end of the write or
|
|
|
|
|
* i_size, whichever is greater.
|
|
|
|
|
*/
|
|
|
|
|
last_byte = max(user_pos + user_len, i_size_read(inode));
|
|
|
|
|
BUG_ON(last_byte < 1);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
|
2010-07-03 08:20:27 +08:00
|
|
|
|
if ((start + wc->w_num_pages) > end_index)
|
|
|
|
|
wc->w_num_pages = end_index - start;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
} else {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
wc->w_num_pages = 1;
|
|
|
|
|
start = target_index;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
index = start + i;
|
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (index >= target_index && index <= end_index &&
|
|
|
|
|
wc->w_type == OCFS2_WRITE_MMAP) {
|
2007-05-10 06:16:19 +08:00
|
|
|
|
/*
|
|
|
|
|
* ocfs2_pagemkwrite() is a little different
|
|
|
|
|
* and wants us to directly use the page
|
|
|
|
|
* passed in.
|
|
|
|
|
*/
|
|
|
|
|
lock_page(mmap_page);
|
|
|
|
|
|
2011-07-25 01:36:54 +08:00
|
|
|
|
/* Exit and let the caller retry */
|
2007-05-10 06:16:19 +08:00
|
|
|
|
if (mmap_page->mapping != mapping) {
|
2011-07-25 01:36:54 +08:00
|
|
|
|
WARN_ON(mmap_page->mapping);
|
2007-05-10 06:16:19 +08:00
|
|
|
|
unlock_page(mmap_page);
|
2011-07-25 01:36:54 +08:00
|
|
|
|
ret = -EAGAIN;
|
2007-05-10 06:16:19 +08:00
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
get_page(mmap_page);
|
2007-05-10 06:16:19 +08:00
|
|
|
|
wc->w_pages[i] = mmap_page;
|
2011-07-25 01:36:54 +08:00
|
|
|
|
wc->w_target_locked = true;
|
2016-03-26 05:20:58 +08:00
|
|
|
|
} else if (index >= target_index && index <= end_index &&
|
|
|
|
|
wc->w_type == OCFS2_WRITE_DIRECT) {
|
|
|
|
|
/* Direct write has no mapping page. */
|
|
|
|
|
wc->w_pages[i] = NULL;
|
|
|
|
|
continue;
|
2007-05-10 06:16:19 +08:00
|
|
|
|
} else {
|
|
|
|
|
wc->w_pages[i] = find_or_create_page(mapping, index,
|
|
|
|
|
GFP_NOFS);
|
|
|
|
|
if (!wc->w_pages[i]) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
2013-02-22 08:42:57 +08:00
|
|
|
|
wait_for_stable_page(wc->w_pages[i]);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
|
|
if (index == target_index)
|
|
|
|
|
wc->w_target_page = wc->w_pages[i];
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
out:
|
2011-07-25 01:36:54 +08:00
|
|
|
|
if (ret)
|
|
|
|
|
wc->w_target_locked = false;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Prepare a single cluster for write one cluster into the file.
|
|
|
|
|
*/
|
|
|
|
|
static int ocfs2_write_cluster(struct address_space *mapping,
|
2016-03-26 05:21:03 +08:00
|
|
|
|
u32 *phys, unsigned int new,
|
2016-03-26 05:20:55 +08:00
|
|
|
|
unsigned int clear_unwritten,
|
2009-08-07 07:12:58 +08:00
|
|
|
|
unsigned int should_zero,
|
2007-06-19 02:22:56 +08:00
|
|
|
|
struct ocfs2_alloc_context *data_ac,
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_alloc_context *meta_ac,
|
|
|
|
|
struct ocfs2_write_ctxt *wc, u32 cpos,
|
|
|
|
|
loff_t user_pos, unsigned user_len)
|
|
|
|
|
{
|
2016-03-26 05:20:55 +08:00
|
|
|
|
int ret, i;
|
2016-03-26 05:21:03 +08:00
|
|
|
|
u64 p_blkno;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct inode *inode = mapping->host;
|
2008-08-21 10:36:33 +08:00
|
|
|
|
struct ocfs2_extent_tree et;
|
2016-03-26 05:21:03 +08:00
|
|
|
|
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (new) {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
u32 tmp_pos;
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
/*
|
|
|
|
|
* This is safe to call with the page locks - it won't take
|
|
|
|
|
* any additional semaphores or cluster locks.
|
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
tmp_pos = cpos;
|
2008-08-18 17:38:45 +08:00
|
|
|
|
ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
|
2016-03-26 05:20:55 +08:00
|
|
|
|
&tmp_pos, 1, !clear_unwritten,
|
|
|
|
|
wc->w_di_bh, wc->w_handle,
|
|
|
|
|
data_ac, meta_ac, NULL);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
/*
|
|
|
|
|
* This shouldn't happen because we must have already
|
|
|
|
|
* calculated the correct meta data allocation required. The
|
|
|
|
|
* internal tree allocation code should know how to increase
|
|
|
|
|
* transaction credits itself.
|
|
|
|
|
*
|
|
|
|
|
* If need be, we could handle -EAGAIN for a
|
|
|
|
|
* RESTART_TRANS here.
|
|
|
|
|
*/
|
|
|
|
|
mlog_bug_on_msg(ret == -EAGAIN,
|
|
|
|
|
"Inode %llu: EAGAIN return during allocation.\n",
|
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2016-03-26 05:20:55 +08:00
|
|
|
|
} else if (clear_unwritten) {
|
2009-02-13 19:54:22 +08:00
|
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
|
|
|
|
|
wc->w_di_bh);
|
2008-08-21 10:36:33 +08:00
|
|
|
|
ret = ocfs2_mark_extent_written(inode, &et,
|
2016-03-26 05:21:03 +08:00
|
|
|
|
wc->w_handle, cpos, 1, *phys,
|
2008-08-21 10:36:33 +08:00
|
|
|
|
meta_ac, &wc->w_dealloc);
|
2007-06-19 02:22:56 +08:00
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* The only reason this should fail is due to an inability to
|
|
|
|
|
* find the extent added.
|
|
|
|
|
*/
|
2016-03-26 05:21:03 +08:00
|
|
|
|
ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (ret < 0) {
|
2014-12-11 07:42:02 +08:00
|
|
|
|
mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
|
2016-03-26 05:21:03 +08:00
|
|
|
|
"at logical cluster %u",
|
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:21:03 +08:00
|
|
|
|
BUG_ON(*phys == 0);
|
|
|
|
|
|
|
|
|
|
p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
|
|
|
|
|
if (!should_zero)
|
|
|
|
|
p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
|
int tmpret;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
/* This is the direct io target page. */
|
|
|
|
|
if (wc->w_pages[i] == NULL) {
|
|
|
|
|
p_blkno++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
|
|
|
|
|
wc->w_pages[i], cpos,
|
2007-06-19 02:22:56 +08:00
|
|
|
|
user_pos, user_len,
|
|
|
|
|
should_zero);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (tmpret) {
|
|
|
|
|
mlog_errno(tmpret);
|
|
|
|
|
if (ret == 0)
|
2009-07-13 11:38:23 +08:00
|
|
|
|
ret = tmpret;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* We only have cleanup to do in case of allocating write.
|
|
|
|
|
*/
|
|
|
|
|
if (ret && new)
|
|
|
|
|
ocfs2_write_failure(inode, wc, user_pos, user_len);
|
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
out:
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
|
static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
|
|
|
|
|
struct ocfs2_alloc_context *data_ac,
|
|
|
|
|
struct ocfs2_alloc_context *meta_ac,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
|
loff_t pos, unsigned len)
|
|
|
|
|
{
|
|
|
|
|
int ret, i;
|
2007-09-18 00:06:29 +08:00
|
|
|
|
loff_t cluster_off;
|
|
|
|
|
unsigned int local_len = len;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-09-18 00:06:29 +08:00
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
|
2007-05-15 09:09:54 +08:00
|
|
|
|
|
|
|
|
|
for (i = 0; i < wc->w_clen; i++) {
|
|
|
|
|
desc = &wc->w_desc[i];
|
|
|
|
|
|
2007-09-18 00:06:29 +08:00
|
|
|
|
/*
|
|
|
|
|
* We have to make sure that the total write passed in
|
|
|
|
|
* doesn't extend past a single cluster.
|
|
|
|
|
*/
|
|
|
|
|
local_len = len;
|
|
|
|
|
cluster_off = pos & (osb->s_clustersize - 1);
|
|
|
|
|
if ((cluster_off + local_len) > osb->s_clustersize)
|
|
|
|
|
local_len = osb->s_clustersize - cluster_off;
|
|
|
|
|
|
2016-03-26 05:21:03 +08:00
|
|
|
|
ret = ocfs2_write_cluster(mapping, &desc->c_phys,
|
2016-03-26 05:20:55 +08:00
|
|
|
|
desc->c_new,
|
|
|
|
|
desc->c_clear_unwritten,
|
2009-08-07 07:12:58 +08:00
|
|
|
|
desc->c_needs_zero,
|
|
|
|
|
data_ac, meta_ac,
|
2007-09-18 00:06:29 +08:00
|
|
|
|
wc, desc->c_cpos, pos, local_len);
|
2007-05-15 09:09:54 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2007-09-18 00:06:29 +08:00
|
|
|
|
|
|
|
|
|
len -= local_len;
|
|
|
|
|
pos += local_len;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
|
out:
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* ocfs2_write_end() wants to know which parts of the target page it
|
|
|
|
|
* should complete the write on. It's easiest to compute them ahead of
|
|
|
|
|
* time when a more complete view of the write is available.
|
|
|
|
|
*/
|
|
|
|
|
static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
|
loff_t pos, unsigned len, int alloc)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
wc->w_target_from = pos & (PAGE_SIZE - 1);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
wc->w_target_to = wc->w_target_from + len;
|
|
|
|
|
|
|
|
|
|
if (alloc == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Allocating write - we may have different boundaries based
|
|
|
|
|
* on page size and cluster size.
|
|
|
|
|
*
|
|
|
|
|
* NOTE: We can no longer compute one value from the other as
|
|
|
|
|
* the actual write length and user provided length may be
|
|
|
|
|
* different.
|
|
|
|
|
*/
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (wc->w_large_pages) {
|
|
|
|
|
/*
|
|
|
|
|
* We only care about the 1st and last cluster within
|
2007-06-19 02:22:56 +08:00
|
|
|
|
* our range and whether they should be zero'd or not. Either
|
2007-05-09 08:47:32 +08:00
|
|
|
|
* value may be extended out to the start/end of a
|
|
|
|
|
* newly allocated cluster.
|
|
|
|
|
*/
|
|
|
|
|
desc = &wc->w_desc[0];
|
2009-08-07 07:12:58 +08:00
|
|
|
|
if (desc->c_needs_zero)
|
2007-05-09 08:47:32 +08:00
|
|
|
|
ocfs2_figure_cluster_boundaries(osb,
|
|
|
|
|
desc->c_cpos,
|
|
|
|
|
&wc->w_target_from,
|
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
|
|
desc = &wc->w_desc[wc->w_clen - 1];
|
2009-08-07 07:12:58 +08:00
|
|
|
|
if (desc->c_needs_zero)
|
2007-05-09 08:47:32 +08:00
|
|
|
|
ocfs2_figure_cluster_boundaries(osb,
|
|
|
|
|
desc->c_cpos,
|
|
|
|
|
NULL,
|
|
|
|
|
&wc->w_target_to);
|
|
|
|
|
} else {
|
|
|
|
|
wc->w_target_from = 0;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
wc->w_target_to = PAGE_SIZE;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
/*
|
|
|
|
|
* Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
|
|
|
|
|
* do the zero work. And should not to clear UNWRITTEN since it will be cleared
|
|
|
|
|
* by the direct io procedure.
|
|
|
|
|
* If this is a new extent that allocated by direct io, we should mark it in
|
|
|
|
|
* the ip_unwritten_list.
|
|
|
|
|
*/
|
|
|
|
|
static int ocfs2_unwritten_check(struct inode *inode,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
|
|
|
|
struct ocfs2_write_cluster_desc *desc)
|
|
|
|
|
{
|
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
if (!desc->c_needs_zero)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
retry:
|
|
|
|
|
spin_lock(&oi->ip_lock);
|
|
|
|
|
/* Needs not to zero no metter buffer or direct. The one who is zero
|
|
|
|
|
* the cluster is doing zero. And he will clear unwritten after all
|
|
|
|
|
* cluster io finished. */
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
|
|
|
|
|
if (desc->c_cpos == ue->ue_cpos) {
|
2016-03-26 05:21:06 +08:00
|
|
|
|
BUG_ON(desc->c_new);
|
|
|
|
|
desc->c_needs_zero = 0;
|
|
|
|
|
desc->c_clear_unwritten = 0;
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (wc->w_type != OCFS2_WRITE_DIRECT)
|
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
|
|
if (new == NULL) {
|
|
|
|
|
spin_unlock(&oi->ip_lock);
|
|
|
|
|
new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
|
|
|
|
|
GFP_NOFS);
|
|
|
|
|
if (new == NULL) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
goto retry;
|
|
|
|
|
}
|
|
|
|
|
/* This direct write will doing zero. */
|
|
|
|
|
new->ue_cpos = desc->c_cpos;
|
|
|
|
|
new->ue_phys = desc->c_phys;
|
|
|
|
|
desc->c_clear_unwritten = 0;
|
|
|
|
|
list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
|
|
|
|
|
list_add_tail(&new->ue_node, &wc->w_unwritten_list);
|
2018-02-01 08:15:02 +08:00
|
|
|
|
wc->w_unwritten_count++;
|
2016-03-26 05:21:06 +08:00
|
|
|
|
new = NULL;
|
|
|
|
|
unlock:
|
|
|
|
|
spin_unlock(&oi->ip_lock);
|
|
|
|
|
out:
|
2018-10-27 06:02:48 +08:00
|
|
|
|
kfree(new);
|
2016-03-26 05:21:06 +08:00
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
|
/*
|
|
|
|
|
* Populate each single-cluster write descriptor in the write context
|
|
|
|
|
* with information about the i/o to be done.
|
2007-06-19 02:22:56 +08:00
|
|
|
|
*
|
|
|
|
|
* Returns the number of clusters that will have to be allocated, as
|
|
|
|
|
* well as a worst case estimate of the number of extent records that
|
|
|
|
|
* would have to be created during a write to an unwritten region.
|
2007-05-15 09:09:54 +08:00
|
|
|
|
*/
|
|
|
|
|
static int ocfs2_populate_write_desc(struct inode *inode,
|
|
|
|
|
struct ocfs2_write_ctxt *wc,
|
2007-06-19 02:22:56 +08:00
|
|
|
|
unsigned int *clusters_to_alloc,
|
|
|
|
|
unsigned int *extents_to_split)
|
2007-02-10 12:24:12 +08:00
|
|
|
|
{
|
2007-05-15 09:09:54 +08:00
|
|
|
|
int ret;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct ocfs2_write_cluster_desc *desc;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
unsigned int num_clusters = 0;
|
2007-06-19 02:22:56 +08:00
|
|
|
|
unsigned int ext_flags = 0;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
u32 phys = 0;
|
|
|
|
|
int i;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
|
*clusters_to_alloc = 0;
|
|
|
|
|
*extents_to_split = 0;
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
for (i = 0; i < wc->w_clen; i++) {
|
|
|
|
|
desc = &wc->w_desc[i];
|
|
|
|
|
desc->c_cpos = wc->w_cpos + i;
|
|
|
|
|
|
|
|
|
|
if (num_clusters == 0) {
|
2007-06-19 02:22:56 +08:00
|
|
|
|
/*
|
|
|
|
|
* Need to look up the next extent record.
|
|
|
|
|
*/
|
2007-05-09 08:47:32 +08:00
|
|
|
|
ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
|
2007-06-19 02:22:56 +08:00
|
|
|
|
&num_clusters, &ext_flags);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
goto out;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
2007-06-19 02:22:56 +08:00
|
|
|
|
|
2009-08-25 08:02:48 +08:00
|
|
|
|
/* We should already CoW the refcountd extent. */
|
|
|
|
|
BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
|
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
|
/*
|
|
|
|
|
* Assume worst case - that we're writing in
|
|
|
|
|
* the middle of the extent.
|
|
|
|
|
*
|
|
|
|
|
* We can assume that the write proceeds from
|
|
|
|
|
* left to right, in which case the extent
|
|
|
|
|
* insert code is smart enough to coalesce the
|
|
|
|
|
* next splits into the previous records created.
|
|
|
|
|
*/
|
|
|
|
|
if (ext_flags & OCFS2_EXT_UNWRITTEN)
|
|
|
|
|
*extents_to_split = *extents_to_split + 2;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
} else if (phys) {
|
|
|
|
|
/*
|
|
|
|
|
* Only increment phys if it doesn't describe
|
|
|
|
|
* a hole.
|
|
|
|
|
*/
|
|
|
|
|
phys++;
|
|
|
|
|
}
|
|
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
|
/*
|
|
|
|
|
* If w_first_new_cpos is < UINT_MAX, we have a non-sparse
|
|
|
|
|
* file that got extended. w_first_new_cpos tells us
|
|
|
|
|
* where the newly allocated clusters are so we can
|
|
|
|
|
* zero them.
|
|
|
|
|
*/
|
|
|
|
|
if (desc->c_cpos >= wc->w_first_new_cpos) {
|
|
|
|
|
BUG_ON(phys == 0);
|
|
|
|
|
desc->c_needs_zero = 1;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
desc->c_phys = phys;
|
|
|
|
|
if (phys == 0) {
|
|
|
|
|
desc->c_new = 1;
|
2009-08-07 07:12:58 +08:00
|
|
|
|
desc->c_needs_zero = 1;
|
2016-03-26 05:20:55 +08:00
|
|
|
|
desc->c_clear_unwritten = 1;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
*clusters_to_alloc = *clusters_to_alloc + 1;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
2009-08-07 07:12:58 +08:00
|
|
|
|
|
|
|
|
|
if (ext_flags & OCFS2_EXT_UNWRITTEN) {
|
2016-03-26 05:20:55 +08:00
|
|
|
|
desc->c_clear_unwritten = 1;
|
2009-08-07 07:12:58 +08:00
|
|
|
|
desc->c_needs_zero = 1;
|
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
ret = ocfs2_unwritten_check(inode, wc, desc);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
num_clusters--;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
|
ret = 0;
|
|
|
|
|
out:
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
static int ocfs2_write_begin_inline(struct address_space *mapping,
|
|
|
|
|
struct inode *inode,
|
|
|
|
|
struct ocfs2_write_ctxt *wc)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
|
struct page *page;
|
|
|
|
|
handle_t *handle;
|
|
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
|
|
2014-10-10 06:25:15 +08:00
|
|
|
|
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
|
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
page = find_or_create_page(mapping, 0, GFP_NOFS);
|
|
|
|
|
if (!page) {
|
2014-10-10 06:25:15 +08:00
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
2007-09-08 05:46:51 +08:00
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
/*
|
|
|
|
|
* If we don't set w_num_pages then this page won't get unlocked
|
|
|
|
|
* and freed on cleanup of the write context.
|
|
|
|
|
*/
|
|
|
|
|
wc->w_pages[0] = wc->w_target_page = page;
|
|
|
|
|
wc->w_num_pages = 1;
|
|
|
|
|
|
2009-02-13 08:41:25 +08:00
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
|
2008-10-18 10:25:01 +08:00
|
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
2007-09-08 05:46:51 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
|
|
|
|
|
ocfs2_set_inode_data_inline(inode, di);
|
|
|
|
|
|
|
|
|
|
if (!PageUptodate(page)) {
|
|
|
|
|
ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
|
|
|
|
|
if (ret) {
|
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
wc->w_handle = handle;
|
|
|
|
|
out:
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Check whether @new_size bytes fit into the inline data area of the dinode
 * held in @di_bh.
 *
 * Returns 1 if @new_size is no larger than the dinode's id_count, 0
 * otherwise.
 */
int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
{
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

	return new_size <= le16_to_cpu(di->id2.i_data.id_count);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Decide whether the write described by @pos/@len can be done into the
 * inode's inline data area and, if so, set the write context up for it.
 *
 * Returns:
 *   1   - the write context has been prepared for an inline write;
 *   0   - the write cannot be done inline (this includes the case where an
 *         inline inode was just converted to extents successfully);
 *   < 0 - error.
 *
 * A write through @mmap_page is never done inline.
 */
static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
					  struct inode *inode, loff_t pos,
					  unsigned len, struct page *mmap_page,
					  struct ocfs2_write_ctxt *wc)
{
	int ret, written = 0;
	loff_t end = pos + len;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di = NULL;

	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
					     len, (unsigned long long)pos,
					     oi->ip_dyn_features);

	/*
	 * Handle inodes which already have inline data 1st.
	 */
	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		if (mmap_page == NULL &&
		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
			goto do_inline_write;

		/*
		 * The write won't fit - we have to give this inode an
		 * inline extent list now.
		 */
		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
		if (ret)
			mlog_errno(ret);

		/*
		 * Leave unconditionally: with written == 0 the caller gets
		 * the conversion result, i.e. 0 ("not inline") on success
		 * or the error code on failure.
		 */
		goto out;
	}

	/*
	 * Check whether the inode can accept inline data.
	 */
	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
		return 0;

	/*
	 * Check whether the write can fit.
	 */
	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
	if (mmap_page ||
	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
		return 0;

do_inline_write:
	ret = ocfs2_write_begin_inline(mapping, inode, wc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * This signals to the caller that the data can be written
	 * inline.
	 */
	written = 1;
out:
	return written ? written : ret;
}
|
|
|
|
|
|
2007-08-29 08:13:23 +08:00
|
|
|
|
/*
|
|
|
|
|
* This function only does anything for file systems which can't
|
|
|
|
|
* handle sparse files.
|
|
|
|
|
*
|
|
|
|
|
* What we want to do here is fill in any hole between the current end
|
|
|
|
|
* of allocation and the end of our write. That way the rest of the
|
|
|
|
|
* write path can treat it as an non-allocating write, which has no
|
|
|
|
|
* special case code for sparse/nonsparse files.
|
|
|
|
|
*/
|
2010-07-02 06:13:31 +08:00
|
|
|
|
static int ocfs2_expand_nonsparse_inode(struct inode *inode,
|
|
|
|
|
struct buffer_head *di_bh,
|
|
|
|
|
loff_t pos, unsigned len,
|
2007-08-29 08:13:23 +08:00
|
|
|
|
struct ocfs2_write_ctxt *wc)
|
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
loff_t newsize = pos + len;
|
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
|
BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
|
2007-08-29 08:13:23 +08:00
|
|
|
|
|
|
|
|
|
if (newsize <= i_size_read(inode))
|
|
|
|
|
return 0;
|
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
|
ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
|
2007-08-29 08:13:23 +08:00
|
|
|
|
if (ret)
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
|
/* There is no wc if this is call from direct. */
|
|
|
|
|
if (wc)
|
|
|
|
|
wc->w_first_new_cpos =
|
|
|
|
|
ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
|
2009-08-07 07:12:58 +08:00
|
|
|
|
|
2007-08-29 08:13:23 +08:00
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2010-07-02 06:13:31 +08:00
|
|
|
|
static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
|
|
|
|
|
loff_t pos)
|
|
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
|
|
|
|
|
if (pos > i_size_read(inode))
|
|
|
|
|
ret = ocfs2_zero_extend(inode, di_bh, pos);
|
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
|
int ocfs2_write_begin_nolock(struct address_space *mapping,
|
|
|
|
|
loff_t pos, unsigned len, ocfs2_write_type_t type,
|
2007-05-15 09:09:54 +08:00
|
|
|
|
struct page **pagep, void **fsdata,
|
|
|
|
|
struct buffer_head *di_bh, struct page *mmap_page)
|
|
|
|
|
{
|
2009-08-07 07:12:58 +08:00
|
|
|
|
int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
|
2010-11-04 15:14:11 +08:00
|
|
|
|
unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
struct ocfs2_write_ctxt *wc;
|
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
|
struct ocfs2_dinode *di;
|
|
|
|
|
struct ocfs2_alloc_context *data_ac = NULL;
|
|
|
|
|
struct ocfs2_alloc_context *meta_ac = NULL;
|
|
|
|
|
handle_t *handle;
|
2008-08-21 10:36:33 +08:00
|
|
|
|
struct ocfs2_extent_tree et;
|
2010-11-04 15:14:11 +08:00
|
|
|
|
int try_free = 1, ret1;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
|
2010-11-04 15:14:11 +08:00
|
|
|
|
try_again:
|
2016-03-26 05:20:52 +08:00
|
|
|
|
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
|
2007-05-15 09:09:54 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
if (ocfs2_supports_inline_data(osb)) {
|
|
|
|
|
ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
|
|
|
|
|
mmap_page, wc);
|
|
|
|
|
if (ret == 1) {
|
|
|
|
|
ret = 0;
|
|
|
|
|
goto success;
|
|
|
|
|
}
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
|
/* Direct io change i_size late, should not zero tail here. */
|
|
|
|
|
if (type != OCFS2_WRITE_DIRECT) {
|
|
|
|
|
if (ocfs2_sparse_alloc(osb))
|
|
|
|
|
ret = ocfs2_zero_tail(inode, di_bh, pos);
|
|
|
|
|
else
|
|
|
|
|
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
|
|
|
|
|
len, wc);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2007-08-29 08:13:23 +08:00
|
|
|
|
}
|
|
|
|
|
|
2009-08-25 08:02:48 +08:00
|
|
|
|
ret = ocfs2_check_range_for_refcount(inode, pos, len);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
} else if (ret == 1) {
|
2010-11-04 15:14:11 +08:00
|
|
|
|
clusters_need = wc->w_clen;
|
2013-08-14 07:00:58 +08:00
|
|
|
|
ret = ocfs2_refcount_cow(inode, di_bh,
|
2009-08-26 09:47:28 +08:00
|
|
|
|
wc->w_cpos, wc->w_clen, UINT_MAX);
|
2009-08-25 08:02:48 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
|
ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
|
|
|
|
|
&extents_to_split);
|
2007-05-15 09:09:54 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
2010-11-04 15:14:11 +08:00
|
|
|
|
clusters_need += clusters_to_alloc;
|
2007-05-15 09:09:54 +08:00
|
|
|
|
|
|
|
|
|
di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
|
|
2011-02-22 21:33:59 +08:00
|
|
|
|
trace_ocfs2_write_begin_nolock(
|
|
|
|
|
(unsigned long long)OCFS2_I(inode)->ip_blkno,
|
|
|
|
|
(long long)i_size_read(inode),
|
|
|
|
|
le32_to_cpu(di->i_clusters),
|
2016-03-26 05:20:52 +08:00
|
|
|
|
pos, len, type, mmap_page,
|
2011-02-22 21:33:59 +08:00
|
|
|
|
clusters_to_alloc, extents_to_split);
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* We set w_target_from, w_target_to here so that
|
|
|
|
|
* ocfs2_write_end() knows which range in the target page to
|
|
|
|
|
* write out. An allocation requires that we write the entire
|
|
|
|
|
* cluster range.
|
|
|
|
|
*/
|
2007-06-19 02:22:56 +08:00
|
|
|
|
if (clusters_to_alloc || extents_to_split) {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* XXX: We are stretching the limits of
|
2007-06-19 02:22:56 +08:00
|
|
|
|
* ocfs2_lock_allocators(). It greatly over-estimates
|
2007-05-09 08:47:32 +08:00
|
|
|
|
* the work to be done.
|
|
|
|
|
*/
|
2009-02-13 19:54:22 +08:00
|
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
|
|
|
|
|
wc->w_di_bh);
|
2008-08-21 10:36:33 +08:00
|
|
|
|
ret = ocfs2_lock_allocators(inode, &et,
|
2008-08-18 17:38:42 +08:00
|
|
|
|
clusters_to_alloc, extents_to_split,
|
2008-08-21 10:36:33 +08:00
|
|
|
|
&data_ac, &meta_ac);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
goto out;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2009-12-08 05:15:40 +08:00
|
|
|
|
if (data_ac)
|
|
|
|
|
data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
|
|
|
|
|
|
2008-08-18 17:38:43 +08:00
|
|
|
|
credits = ocfs2_calc_extend_credits(inode->i_sb,
|
2013-11-13 07:06:52 +08:00
|
|
|
|
&di->id2.i_list);
|
2016-03-26 05:21:01 +08:00
|
|
|
|
} else if (type == OCFS2_WRITE_DIRECT)
|
|
|
|
|
/* direct write needs not to start trans if no extents alloc. */
|
|
|
|
|
goto success;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2009-08-07 07:12:58 +08:00
|
|
|
|
/*
|
|
|
|
|
* We have to zero sparse allocated clusters, unwritten extent clusters,
|
|
|
|
|
* and non-sparse clusters we just extended. For non-sparse writes,
|
|
|
|
|
* we know zeros will only be needed in the first and/or last cluster.
|
|
|
|
|
*/
|
2016-03-26 05:21:06 +08:00
|
|
|
|
if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
|
|
|
|
|
wc->w_desc[wc->w_clen - 1].c_needs_zero))
|
2009-08-07 07:12:58 +08:00
|
|
|
|
cluster_of_pages = 1;
|
|
|
|
|
else
|
|
|
|
|
cluster_of_pages = 0;
|
|
|
|
|
|
|
|
|
|
ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
2007-02-10 12:24:12 +08:00
|
|
|
|
handle = ocfs2_start_trans(osb, credits);
|
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
|
mlog_errno(ret);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
goto out;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
wc->w_handle = handle;
|
|
|
|
|
|
2010-03-03 22:05:00 +08:00
|
|
|
|
if (clusters_to_alloc) {
|
|
|
|
|
ret = dquot_alloc_space_nodirty(inode,
|
|
|
|
|
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
|
|
|
|
|
if (ret)
|
|
|
|
|
goto out_commit;
|
2008-10-10 01:38:40 +08:00
|
|
|
|
}
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
|
|
2009-02-13 08:41:25 +08:00
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
|
2008-10-18 10:25:01 +08:00
|
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (ret) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
mlog_errno(ret);
|
2008-10-10 01:38:40 +08:00
|
|
|
|
goto out_quota;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* Fill our page array first. That way we've grabbed enough so
|
|
|
|
|
* that we can zero and flush if we error after adding the
|
|
|
|
|
* extent.
|
|
|
|
|
*/
|
2010-07-03 08:20:27 +08:00
|
|
|
|
ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
|
2009-08-07 07:12:58 +08:00
|
|
|
|
cluster_of_pages, mmap_page);
|
2022-01-15 06:03:38 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
/*
|
|
|
|
|
* ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
|
|
|
|
|
* the target page. In this case, we exit with no error and no target
|
|
|
|
|
* page. This will trigger the caller, page_mkwrite(), to re-try
|
|
|
|
|
* the operation.
|
|
|
|
|
*/
|
|
|
|
|
if (type == OCFS2_WRITE_MMAP && ret == -EAGAIN) {
|
|
|
|
|
BUG_ON(wc->w_target_page);
|
|
|
|
|
ret = 0;
|
|
|
|
|
goto out_quota;
|
|
|
|
|
}
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2022-01-15 06:03:38 +08:00
|
|
|
|
mlog_errno(ret);
|
2011-07-25 01:36:54 +08:00
|
|
|
|
goto out_quota;
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-15 09:09:54 +08:00
|
|
|
|
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
|
|
|
|
|
len);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
2008-10-10 01:38:40 +08:00
|
|
|
|
goto out_quota;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (data_ac)
|
|
|
|
|
ocfs2_free_alloc_context(data_ac);
|
|
|
|
|
if (meta_ac)
|
|
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
success:
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (pagep)
|
|
|
|
|
*pagep = wc->w_target_page;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
*fsdata = wc;
|
|
|
|
|
return 0;
|
2008-10-10 01:38:40 +08:00
|
|
|
|
out_quota:
|
|
|
|
|
if (clusters_to_alloc)
|
2010-03-03 22:05:00 +08:00
|
|
|
|
dquot_free_space(inode,
|
2008-10-10 01:38:40 +08:00
|
|
|
|
ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
|
2007-02-10 12:24:12 +08:00
|
|
|
|
out_commit:
|
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
|
|
|
|
|
out:
|
ocfs2: fix deadlock on mmapped page in ocfs2_write_begin_nolock()
The testcase "mmaptruncate" of ocfs2-test deadlocks occasionally.
In this testcase, we create a 2*CLUSTER_SIZE file and mmap() on it;
there are 2 process repeatedly performing the following operations
respectively: one is doing memset(mmaped_addr + 2*CLUSTER_SIZE - 1, 'a',
1), while the another is playing ftruncate(fd, 2*CLUSTER_SIZE) and then
ftruncate(fd, CLUSTER_SIZE) again and again.
This is the backtrace when the deadlock happens:
__wait_on_bit_lock+0x50/0xa0
__lock_page+0xb7/0xc0
ocfs2_write_begin_nolock+0x163f/0x1790 [ocfs2]
ocfs2_page_mkwrite+0x1c7/0x2a0 [ocfs2]
do_page_mkwrite+0x66/0xc0
handle_mm_fault+0x685/0x1350
__do_page_fault+0x1d8/0x4d0
trace_do_page_fault+0x37/0xf0
do_async_page_fault+0x19/0x70
async_page_fault+0x28/0x30
In ocfs2_write_begin_nolock(), we first grab the pages and then allocate
disk space for this write; ocfs2_try_to_free_truncate_log() will be
called if -ENOSPC is returned; if we're lucky to get enough clusters,
which is usually the case, we start over again.
But in ocfs2_free_write_ctxt() the target page isn't unlocked, so we
will deadlock when trying to grab the target page again.
Also, -ENOMEM might be returned in ocfs2_grab_pages_for_write().
Another deadlock will happen in __do_page_mkwrite() if
ocfs2_page_mkwrite() returns non-VM_FAULT_LOCKED, and along with a
locked target page.
These two errors fail on the same path, so fix them by unlocking the
target page manually before ocfs2_free_write_ctxt().
Jan Kara helps me clear out the JBD2 part, and suggest the hint for root
cause.
Changes since v1:
1. Also put ENOMEM error case into consideration.
Link: http://lkml.kernel.org/r/1474173902-32075-1-git-send-email-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: He Gang <ghe@suse.com>
Acked-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-01 06:11:32 +08:00
|
|
|
|
/*
|
|
|
|
|
* The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
|
|
|
|
|
* even in case of error here like ENOSPC and ENOMEM. So, we need
|
|
|
|
|
* to unlock the target page manually to prevent deadlocks when
|
|
|
|
|
* retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
|
|
|
|
|
* to VM code.
|
|
|
|
|
*/
|
|
|
|
|
if (wc->w_target_locked)
|
|
|
|
|
unlock_page(mmap_page);
|
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
ocfs2_free_write_ctxt(inode, wc);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
2013-11-13 07:07:06 +08:00
|
|
|
|
if (data_ac) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
ocfs2_free_alloc_context(data_ac);
|
2013-11-13 07:07:06 +08:00
|
|
|
|
data_ac = NULL;
|
|
|
|
|
}
|
|
|
|
|
if (meta_ac) {
|
2007-02-10 12:24:12 +08:00
|
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2013-11-13 07:07:06 +08:00
|
|
|
|
meta_ac = NULL;
|
|
|
|
|
}
|
2010-11-04 15:14:11 +08:00
|
|
|
|
|
|
|
|
|
if (ret == -ENOSPC && try_free) {
|
|
|
|
|
/*
|
|
|
|
|
* Try to free some truncate log so that we can have enough
|
|
|
|
|
* clusters to allocate.
|
|
|
|
|
*/
|
|
|
|
|
try_free = 0;
|
|
|
|
|
|
|
|
|
|
ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
|
|
|
|
|
if (ret1 == 1)
|
|
|
|
|
goto try_again;
|
|
|
|
|
|
|
|
|
|
if (ret1 < 0)
|
|
|
|
|
mlog_errno(ret1);
|
|
|
|
|
}
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-10-16 16:25:24 +08:00
|
|
|
|
static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
|
2022-02-23 03:31:43 +08:00
|
|
|
|
loff_t pos, unsigned len,
|
2007-10-16 16:25:24 +08:00
|
|
|
|
struct page **pagep, void **fsdata)
|
2007-05-10 06:14:45 +08:00
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
|
2007-10-19 06:30:42 +08:00
|
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Take alloc sem here to prevent concurrent lookups. That way
|
|
|
|
|
* the mapping, zeroing and tree manipulation within
|
2022-04-29 22:40:40 +08:00
|
|
|
|
* ocfs2_write() will be safe against ->read_folio(). This
|
2007-05-10 06:14:45 +08:00
|
|
|
|
* should also serve to lock out allocation from a shared
|
|
|
|
|
* writeable region.
|
|
|
|
|
*/
|
|
|
|
|
down_write(&OCFS2_I(inode)->ip_alloc_sem);
|
|
|
|
|
|
2016-03-26 05:20:52 +08:00
|
|
|
|
ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
|
|
|
|
|
pagep, fsdata, di_bh, NULL);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
2007-10-19 06:23:46 +08:00
|
|
|
|
goto out_fail;
|
2007-05-10 06:14:45 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brelse(di_bh);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
out_fail:
|
|
|
|
|
up_write(&OCFS2_I(inode)->ip_alloc_sem);
|
|
|
|
|
|
|
|
|
|
brelse(di_bh);
|
2007-10-19 06:30:42 +08:00
|
|
|
|
ocfs2_inode_unlock(inode, 1);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
/*
 * Finish an inline-data write: copy the *copied bytes starting at @pos from
 * the target page into the dinode's inline data area.
 *
 * If fewer bytes than @len were copied into the page and the page is not
 * up to date, nothing is written and *copied is set to 0.
 */
static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
				   unsigned len, unsigned *copied,
				   struct ocfs2_dinode *di,
				   struct ocfs2_write_ctxt *wc)
{
	void *kaddr;

	if (unlikely(*copied < len)) {
		if (!PageUptodate(wc->w_target_page)) {
			*copied = 0;
			return;
		}
	}

	/* @pos is used as the offset in both the page and the inline area. */
	kaddr = kmap_atomic(wc->w_target_page);
	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
	kunmap_atomic(kaddr);

	trace_ocfs2_write_end_inline(
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)pos, *copied,
	     le16_to_cpu(di->id2.i_data.id_count),
	     le16_to_cpu(di->i_dyn_features));
}
|
|
|
|
|
|
2007-05-10 06:16:19 +08:00
|
|
|
|
int ocfs2_write_end_nolock(struct address_space *mapping,
|
2016-12-13 08:41:17 +08:00
|
|
|
|
loff_t pos, unsigned len, unsigned copied, void *fsdata)
|
2007-05-09 08:47:32 +08:00
|
|
|
|
{
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
|
int i, ret;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
unsigned from, to, start = pos & (PAGE_SIZE - 1);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
|
struct ocfs2_write_ctxt *wc = fsdata;
|
|
|
|
|
struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
|
|
|
|
|
handle_t *handle = wc->w_handle;
|
|
|
|
|
struct page *tmppage;
|
|
|
|
|
|
2016-03-26 05:21:06 +08:00
|
|
|
|
BUG_ON(!list_empty(&wc->w_unwritten_list));
|
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
|
if (handle) {
|
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
|
|
|
|
|
wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
|
|
|
|
|
if (ret) {
|
|
|
|
|
copied = ret;
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
|
|
|
|
|
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
|
|
|
|
|
goto out_write_size;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (unlikely(copied < len) && wc->w_target_page) {
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (!PageUptodate(wc->w_target_page))
|
|
|
|
|
copied = 0;
|
|
|
|
|
|
|
|
|
|
ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
|
|
|
|
|
start+len);
|
|
|
|
|
}
|
2016-03-26 05:20:58 +08:00
|
|
|
|
if (wc->w_target_page)
|
|
|
|
|
flush_dcache_page(wc->w_target_page);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
|
|
|
|
for(i = 0; i < wc->w_num_pages; i++) {
|
|
|
|
|
tmppage = wc->w_pages[i];
|
|
|
|
|
|
2016-03-26 05:20:58 +08:00
|
|
|
|
/* This is the direct io target page. */
|
|
|
|
|
if (tmppage == NULL)
|
|
|
|
|
continue;
|
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
if (tmppage == wc->w_target_page) {
|
|
|
|
|
from = wc->w_target_from;
|
|
|
|
|
to = wc->w_target_to;
|
|
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
BUG_ON(from > PAGE_SIZE ||
|
|
|
|
|
to > PAGE_SIZE ||
|
2007-05-09 08:47:32 +08:00
|
|
|
|
to < from);
|
|
|
|
|
} else {
|
|
|
|
|
/*
|
|
|
|
|
* Pages adjacent to the target (if any) imply
|
|
|
|
|
* a hole-filling write in which case we want
|
|
|
|
|
* to flush their entire range.
|
|
|
|
|
*/
|
|
|
|
|
from = 0;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause too much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is a revert of the changes to the
PAGE_CACHE_ALIGN definition: we are going to drop it later.
There are a few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation will
also be addressed in a separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
|
to = PAGE_SIZE;
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
|
|
|
|
|
2008-07-17 08:22:22 +08:00
|
|
|
|
if (page_has_buffers(tmppage)) {
|
2019-09-24 06:33:08 +08:00
|
|
|
|
if (handle && ocfs2_should_order_data(inode)) {
|
|
|
|
|
loff_t start_byte =
|
|
|
|
|
((loff_t)tmppage->index << PAGE_SHIFT) +
|
|
|
|
|
from;
|
|
|
|
|
loff_t length = to - from;
|
|
|
|
|
ocfs2_jbd2_inode_add_write(handle, inode,
|
|
|
|
|
start_byte, length);
|
|
|
|
|
}
|
2008-07-17 08:22:22 +08:00
|
|
|
|
block_commit_write(tmppage, from, to);
|
|
|
|
|
}
|
2007-05-09 08:47:32 +08:00
|
|
|
|
}
|
|
|
|
|
|
2007-09-08 05:46:51 +08:00
|
|
|
|
out_write_size:
|
2016-03-26 05:21:01 +08:00
|
|
|
|
/* Direct io do not update i_size here. */
|
|
|
|
|
if (wc->w_type != OCFS2_WRITE_DIRECT) {
|
|
|
|
|
pos += copied;
|
|
|
|
|
if (pos > i_size_read(inode)) {
|
|
|
|
|
i_size_write(inode, pos);
|
|
|
|
|
mark_inode_dirty(inode);
|
|
|
|
|
}
|
|
|
|
|
inode->i_blocks = ocfs2_inode_sector_count(inode);
|
|
|
|
|
di->i_size = cpu_to_le64((u64)i_size_read(inode));
|
2016-09-14 22:48:04 +08:00
|
|
|
|
inode->i_mtime = inode->i_ctime = current_time(inode);
|
2016-03-26 05:21:01 +08:00
|
|
|
|
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
|
|
|
|
|
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
|
2019-10-07 08:57:54 +08:00
|
|
|
|
if (handle)
|
|
|
|
|
ocfs2_update_inode_fsync_trans(handle, inode, 1);
|
2016-03-26 05:21:01 +08:00
|
|
|
|
}
|
|
|
|
|
if (handle)
|
|
|
|
|
ocfs2_journal_dirty(handle, wc->w_di_bh);
|
2007-05-09 08:47:32 +08:00
|
|
|
|
|
ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock()
1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(),
jbd2_journal_restart() may also be called, in this function transaction
A's t_updates-- and obtains a new transaction B. If
jbd2_journal_commit_transaction() is happened to commit transaction A,
when t_updates==0, it will continue to complete commit and unfile
buffer.
So when jbd2_journal_dirty_metadata(), the handle is pointed a new
transaction B, and the buffer head's journal head is already freed,
jh->b_transaction == NULL, jh->b_next_transaction == NULL, it returns
EINVAL, So it triggers the BUG_ON(status).
thread 1 jbd2
ocfs2_write_begin jbd2_journal_commit_transaction
ocfs2_write_begin_nolock
ocfs2_start_trans
jbd2__journal_start(t_updates+1,
transaction A)
ocfs2_journal_access_di
ocfs2_write_cluster_by_desc
ocfs2_mark_extent_written
ocfs2_change_extent_flag
ocfs2_split_extent
ocfs2_extend_rotate_transaction
jbd2_journal_restart
(t_updates-1,transaction B) t_updates==0
__jbd2_journal_refile_buffer
(jh->b_transaction = NULL)
ocfs2_write_end
ocfs2_write_end_nolock
ocfs2_journal_dirty
jbd2_journal_dirty_metadata(bug)
ocfs2_commit_trans
2. In ext4, I found that: jbd2_journal_get_write_access() called by
ext4_write_end.
ext4_write_begin
ext4_journal_start
__ext4_journal_start_sb
ext4_journal_check_start
jbd2__journal_start
ext4_write_end
ext4_mark_inode_dirty
ext4_reserve_inode_write
ext4_journal_get_write_access
jbd2_journal_get_write_access
ext4_mark_iloc_dirty
ext4_do_update_inode
ext4_handle_dirty_metadata
jbd2_journal_dirty_metadata
3. So I think we should put ocfs2_journal_access_di before
ocfs2_journal_dirty in the ocfs2_write_end. and it works well after my
modification.
Signed-off-by: vicky <vicky.yangwenfang@huawei.com>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Zhangguanghui <zhang.guanghui@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:45 +08:00
|
|
|
|
out:
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are held by the write process, this
deadlock may hang the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
|
|
|
|
|
* lock, or it will cause a deadlock since journal commit threads holds
|
|
|
|
|
* this lock and will ask for the page lock when flushing the data.
|
|
|
|
|
* put it here to preserve the unlock order.
|
|
|
|
|
*/
|
|
|
|
|
ocfs2_unlock_pages(wc);
|
|
|
|
|
|
2016-03-26 05:21:01 +08:00
|
|
|
|
if (handle)
|
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
2007-06-23 06:52:36 +08:00
|
|
|
|
|
2007-06-19 02:22:56 +08:00
|
|
|
|
ocfs2_run_deallocs(osb, &wc->w_dealloc);
|
|
|
|
|
|
ocfs2: fix journal commit deadlock
For buffer write, page lock will be got in write_begin and released in
write_end, in ocfs2_write_end_nolock(), before it unlock the page in
ocfs2_free_write_ctxt(), it calls ocfs2_run_deallocs(), this will ask
for the read lock of journal->j_trans_barrier. Holding page lock and
ask for journal->j_trans_barrier breaks the locking order.
This will cause a deadlock with journal commit threads, ocfs2cmt will
get write lock of journal->j_trans_barrier first, then it wakes up
kjournald2 to do the commit work, at last it waits until done. To
commit journal, kjournald2 needs flushing data first, it needs get the
cache page lock.
Since some ocfs2 cluster locks are held by the write process, this
deadlock may hang the whole cluster.
unlock pages before ocfs2_run_deallocs() can fix the locking order, also
put unlock before ocfs2_commit_trans() to make page lock is unlocked
before j_trans_barrier to preserve unlocking order.
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Wengang Wang <wen.gang.wang@oracle.com>
Cc: <stable@vger.kernel.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-19 08:17:37 +08:00
|
|
|
|
brelse(wc->w_di_bh);
|
|
|
|
|
kfree(wc);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
|
|
|
|
|
return copied;
|
|
|
|
|
}
|
|
|
|
|
|
2007-10-16 16:25:24 +08:00
|
|
|
|
static int ocfs2_write_end(struct file *file, struct address_space *mapping,
|
|
|
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
|
|
|
struct page *page, void *fsdata)
|
2007-05-10 06:14:45 +08:00
|
|
|
|
{
|
|
|
|
|
int ret;
|
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
|
2016-12-13 08:41:17 +08:00
|
|
|
|
ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
|
2007-05-10 06:14:45 +08:00
|
|
|
|
|
2007-05-09 08:47:32 +08:00
|
|
|
|
up_write(&OCFS2_I(inode)->ip_alloc_sem);
|
2007-10-19 06:30:42 +08:00
|
|
|
|
ocfs2_inode_unlock(inode, 1);
|
2007-02-10 12:24:12 +08:00
|
|
|
|
|
2007-05-10 06:14:45 +08:00
|
|
|
|
return ret;
|
2007-02-10 12:24:12 +08:00
|
|
|
|
}
|
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_dio_write_ctxt {
|
|
|
|
|
struct list_head dw_zero_list;
|
|
|
|
|
unsigned dw_zero_count;
|
|
|
|
|
int dw_orphaned;
|
|
|
|
|
pid_t dw_writer_pid;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static struct ocfs2_dio_write_ctxt *
|
|
|
|
|
ocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
|
|
|
|
|
{
|
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc = NULL;
|
|
|
|
|
|
|
|
|
|
if (bh->b_private)
|
|
|
|
|
return bh->b_private;
|
|
|
|
|
|
|
|
|
|
dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
|
|
|
|
|
if (dwc == NULL)
|
|
|
|
|
return NULL;
|
|
|
|
|
INIT_LIST_HEAD(&dwc->dw_zero_list);
|
|
|
|
|
dwc->dw_zero_count = 0;
|
|
|
|
|
dwc->dw_orphaned = 0;
|
|
|
|
|
dwc->dw_writer_pid = task_pid_nr(current);
|
|
|
|
|
bh->b_private = dwc;
|
|
|
|
|
*alloc = 1;
|
|
|
|
|
|
|
|
|
|
return dwc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ocfs2_dio_free_write_ctx(struct inode *inode,
|
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc)
|
|
|
|
|
{
|
|
|
|
|
ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
|
|
|
|
|
kfree(dwc);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* TODO: Make this into a generic get_blocks function.
|
|
|
|
|
*
|
|
|
|
|
* From do_direct_io in direct-io.c:
|
|
|
|
|
* "So what we do is to permit the ->get_blocks function to populate
|
|
|
|
|
* bh.b_size with the size of IO which is permitted at this offset and
|
|
|
|
|
* this i_blkbits."
|
|
|
|
|
*
|
|
|
|
|
* This function is called directly from get_more_blocks in direct-io.c.
|
|
|
|
|
*
|
|
|
|
|
* called like this: dio->get_blocks(dio->inode, fs_startblk,
|
|
|
|
|
* fs_count, map_bh, dio->rw == WRITE);
|
|
|
|
|
*/
|
2017-11-16 09:31:44 +08:00
|
|
|
|
static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
|
{
|
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
2016-03-26 05:21:18 +08:00
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_write_ctxt *wc;
|
|
|
|
|
struct ocfs2_write_cluster_desc *desc = NULL;
|
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc = NULL;
|
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
|
u64 p_blkno;
|
2019-10-07 08:57:47 +08:00
|
|
|
|
unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
|
|
|
|
|
loff_t pos = iblock << i_blkbits;
|
|
|
|
|
sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
unsigned len, total_len = bh_result->b_size;
|
|
|
|
|
int ret = 0, first_get_block = 0;
|
|
|
|
|
|
|
|
|
|
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
|
|
|
|
|
len = min(total_len, len);
|
|
|
|
|
|
2019-10-07 08:57:47 +08:00
|
|
|
|
/*
|
|
|
|
|
* bh_result->b_size is calculated in get_more_blocks according to write
|
|
|
|
|
* "pos" and "end", we need to map twice to return different buffer states:
|
|
|
|
|
* 1. area within file size: do not set NEW;
|
|
|
|
|
* 2. area beyond file size: set NEW.
|
|
|
|
|
*
|
|
|
|
|
* iblock endblk
|
|
|
|
|
* |--------|---------|---------|---------
|
|
|
|
|
* |<-------area in file------->|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
if ((iblock <= endblk) &&
|
|
|
|
|
((iblock + ((len - 1) >> i_blkbits)) > endblk))
|
|
|
|
|
len = (endblk - iblock + 1) << i_blkbits;
|
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
mlog(0, "get block of %lu at %llu:%u req %u\n",
|
|
|
|
|
inode->i_ino, pos, len, total_len);
|
|
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
|
/*
|
|
|
|
|
* Because we need to change file size in ocfs2_dio_end_io_write(), or
|
|
|
|
|
* we may need to add it to orphan dir. So we cannot take the fast path
|
|
|
|
|
* when the file size will be changed.
|
|
|
|
|
*/
|
|
|
|
|
if (pos + total_len <= i_size_read(inode)) {
|
2016-03-26 05:21:18 +08:00
|
|
|
|
|
2017-11-16 09:31:44 +08:00
|
|
|
|
/* This is the fast path for re-write. */
|
|
|
|
|
ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
|
2016-03-26 05:21:20 +08:00
|
|
|
|
if (buffer_mapped(bh_result) &&
|
|
|
|
|
!buffer_new(bh_result) &&
|
|
|
|
|
ret == 0)
|
|
|
|
|
goto out;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
2016-03-26 05:21:20 +08:00
|
|
|
|
/* Clear state set by ocfs2_get_block. */
|
|
|
|
|
bh_result->b_state = 0;
|
|
|
|
|
}
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
|
|
|
|
|
if (unlikely(dwc == NULL)) {
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
|
|
|
|
|
ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
|
|
|
|
|
!dwc->dw_orphaned) {
|
|
|
|
|
/*
|
|
|
|
|
* when we are going to alloc extents beyond file size, add the
|
|
|
|
|
* inode to orphan dir, so we can reclaim that space if the
|
|
|
|
|
* system crashes during the write.
|
|
|
|
|
*/
|
|
|
|
|
ret = ocfs2_add_inode_to_orphan(osb, inode);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
dwc->dw_orphaned = 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-26 05:21:18 +08:00
|
|
|
|
down_write(&oi->ip_alloc_sem);
|
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
if (first_get_block) {
|
2018-04-06 07:18:33 +08:00
|
|
|
|
if (ocfs2_sparse_alloc(osb))
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
ret = ocfs2_zero_tail(inode, di_bh, pos);
|
|
|
|
|
else
|
|
|
|
|
ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
|
|
|
|
|
total_len, NULL);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
|
|
|
|
|
OCFS2_WRITE_DIRECT, NULL,
|
|
|
|
|
(void **)&wc, di_bh, NULL);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
desc = &wc->w_desc[0];
|
|
|
|
|
|
|
|
|
|
p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
|
|
|
|
|
BUG_ON(p_blkno == 0);
|
|
|
|
|
p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
|
|
|
|
|
|
|
|
|
|
map_bh(bh_result, inode->i_sb, p_blkno);
|
|
|
|
|
bh_result->b_size = len;
|
|
|
|
|
if (desc->c_needs_zero)
|
|
|
|
|
set_buffer_new(bh_result);
|
|
|
|
|
|
2019-10-07 08:57:47 +08:00
|
|
|
|
if (iblock > endblk)
|
|
|
|
|
set_buffer_new(bh_result);
|
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
/* May sleep in end_io. It should not happen in an irq context. So defer
|
|
|
|
|
* it to dio work queue. */
|
|
|
|
|
set_buffer_defer_completion(bh_result);
|
|
|
|
|
|
|
|
|
|
if (!list_empty(&wc->w_unwritten_list)) {
|
|
|
|
|
struct ocfs2_unwritten_extent *ue = NULL;
|
|
|
|
|
|
|
|
|
|
ue = list_first_entry(&wc->w_unwritten_list,
|
|
|
|
|
struct ocfs2_unwritten_extent,
|
|
|
|
|
ue_node);
|
|
|
|
|
BUG_ON(ue->ue_cpos != desc->c_cpos);
|
|
|
|
|
/* The physical address may be 0, fill it. */
|
|
|
|
|
ue->ue_phys = desc->c_phys;
|
|
|
|
|
|
|
|
|
|
list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
|
2018-02-01 08:15:02 +08:00
|
|
|
|
dwc->dw_zero_count += wc->w_unwritten_count;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
}
|
|
|
|
|
|
2016-12-13 08:41:17 +08:00
|
|
|
|
ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
BUG_ON(ret != len);
|
|
|
|
|
ret = 0;
|
|
|
|
|
unlock:
|
2016-03-26 05:21:18 +08:00
|
|
|
|
up_write(&oi->ip_alloc_sem);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
ocfs2_inode_unlock(inode, 1);
|
|
|
|
|
brelse(di_bh);
|
|
|
|
|
out:
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
ret = -EIO;
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2016-11-10 06:42:49 +08:00
|
|
|
|
static int ocfs2_dio_end_io_write(struct inode *inode,
|
|
|
|
|
struct ocfs2_dio_write_ctxt *dwc,
|
|
|
|
|
loff_t offset,
|
|
|
|
|
ssize_t bytes)
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
{
|
|
|
|
|
struct ocfs2_cached_dealloc_ctxt dealloc;
|
|
|
|
|
struct ocfs2_extent_tree et;
|
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
2016-03-26 05:21:18 +08:00
|
|
|
|
struct ocfs2_inode_info *oi = OCFS2_I(inode);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_unwritten_extent *ue = NULL;
|
|
|
|
|
struct buffer_head *di_bh = NULL;
|
|
|
|
|
struct ocfs2_dinode *di;
|
|
|
|
|
struct ocfs2_alloc_context *data_ac = NULL;
|
|
|
|
|
struct ocfs2_alloc_context *meta_ac = NULL;
|
|
|
|
|
handle_t *handle = NULL;
|
|
|
|
|
loff_t end = offset + bytes;
|
2021-04-10 04:27:29 +08:00
|
|
|
|
int ret = 0, credits = 0;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
ocfs2_init_dealloc_ctxt(&dealloc);
|
|
|
|
|
|
|
|
|
|
/* We clear unwritten extents, delete the orphan and change i_size here.
|
|
|
|
|
* If none of these is needed, we can skip all this. */
|
|
|
|
|
if (list_empty(&dwc->dw_zero_list) &&
|
|
|
|
|
end <= i_size_read(inode) &&
|
|
|
|
|
!dwc->dw_orphaned)
|
|
|
|
|
goto out;
|
|
|
|
|
|
2016-03-26 05:21:18 +08:00
|
|
|
|
ret = ocfs2_inode_lock(inode, &di_bh, 1);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
down_write(&oi->ip_alloc_sem);
|
|
|
|
|
|
2022-03-23 05:38:45 +08:00
|
|
|
|
/* Delete orphan before acquiring i_rwsem. */
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
if (dwc->dw_orphaned) {
|
|
|
|
|
BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
|
|
|
|
|
|
|
|
|
|
end = end > i_size_read(inode) ? end : 0;
|
|
|
|
|
|
|
|
|
|
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
|
|
|
|
|
!!end, end);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
}
|
|
|
|
|
|
2016-12-10 08:10:15 +08:00
|
|
|
|
di = (struct ocfs2_dinode *)di_bh->b_data;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
|
|
|
|
|
|
2018-02-01 08:15:06 +08:00
|
|
|
|
/* Attach dealloc with extent tree in case that we may reuse extents
|
|
|
|
|
* which are already unlinked from current extent tree due to extent
|
|
|
|
|
* rotation and merging.
|
|
|
|
|
*/
|
|
|
|
|
et.et_dealloc = &dealloc;
|
|
|
|
|
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
|
|
|
|
|
&data_ac, &meta_ac);
|
2016-03-26 05:21:23 +08:00
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
|
|
|
|
|
|
|
|
|
|
handle = ocfs2_start_trans(osb, credits);
|
|
|
|
|
if (IS_ERR(handle)) {
|
|
|
|
|
ret = PTR_ERR(handle);
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto unlock;
|
|
|
|
|
}
|
|
|
|
|
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
|
|
|
|
|
OCFS2_JOURNAL_ACCESS_WRITE);
|
|
|
|
|
if (ret) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
goto commit;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
|
|
|
|
|
ret = ocfs2_mark_extent_written(inode, &et, handle,
|
|
|
|
|
ue->ue_cpos, 1,
|
|
|
|
|
ue->ue_phys,
|
|
|
|
|
meta_ac, &dealloc);
|
|
|
|
|
if (ret < 0) {
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (end > i_size_read(inode)) {
|
|
|
|
|
ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
|
|
|
|
|
if (ret < 0)
|
|
|
|
|
mlog_errno(ret);
|
|
|
|
|
}
|
|
|
|
|
commit:
|
|
|
|
|
ocfs2_commit_trans(osb, handle);
|
|
|
|
|
unlock:
|
2016-03-26 05:21:18 +08:00
|
|
|
|
up_write(&oi->ip_alloc_sem);
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
ocfs2_inode_unlock(inode, 1);
|
|
|
|
|
brelse(di_bh);
|
|
|
|
|
out:
|
|
|
|
|
if (data_ac)
|
|
|
|
|
ocfs2_free_alloc_context(data_ac);
|
|
|
|
|
if (meta_ac)
|
|
|
|
|
ocfs2_free_alloc_context(meta_ac);
|
2016-03-26 05:21:23 +08:00
|
|
|
|
ocfs2_run_deallocs(osb, &dealloc);
|
|
|
|
|
ocfs2_dio_free_write_ctx(inode, dwc);
|
2016-11-10 06:42:49 +08:00
|
|
|
|
|
|
|
|
|
return ret;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
|
|
|
|
|
* particularly interested in the aio/dio case. We use the rw_lock DLM lock
|
|
|
|
|
* to protect io on one node from truncation on another.
|
|
|
|
|
*
* @iocb:    kiocb of the finished direct I/O; the inode is taken from its
*           file (iocb->ki_filp). It must still be marked rw-locked on
*           entry (BUG_ON otherwise).
* @offset:  forwarded unchanged to ocfs2_dio_end_io_write().
* @bytes:   result of the I/O; a value <= 0 is logged as a failed direct I/O.
* @private: optional per-write context. When non-NULL it is passed to
*           ocfs2_dio_end_io_write() if bytes > 0, and released with
*           ocfs2_dio_free_write_ctx() otherwise.
*
* The rw-locked mark on the iocb is cleared and the inode's rw lock is
* dropped before returning.
*
* Returns 0, or the return value of ocfs2_dio_end_io_write() when that
* function is called.
*/
|
|
|
|
|
static int ocfs2_dio_end_io(struct kiocb *iocb,
|
|
|
|
|
loff_t offset,
|
|
|
|
|
ssize_t bytes,
|
|
|
|
|
void *private)
|
|
|
|
|
{
|
|
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
|
|
|
|
int level;
|
2016-11-10 06:42:49 +08:00
|
|
|
|
int ret = 0;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
|
/* this io's submitter should not have unlocked this before we could */
|
|
|
|
|
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
|
|
|
|
|
|
2018-11-17 07:08:25 +08:00
|
|
|
|
/* A non-positive byte count is reported through mlog_ratelimited(). */
if (bytes <= 0)
|
|
|
|
|
mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
|
|
|
|
|
(long long)bytes);
|
|
|
|
|
/*
 * With a context attached: finish the write when bytes were transferred,
 * otherwise only free the context. ret is changed only on the first path.
 */
if (private) {
|
|
|
|
|
if (bytes > 0)
|
|
|
|
|
ret = ocfs2_dio_end_io_write(inode, private, offset,
|
|
|
|
|
bytes);
|
|
|
|
|
else
|
|
|
|
|
ocfs2_dio_free_write_ctx(inode, private);
|
|
|
|
|
}
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
|
/* Clear the rw-locked mark, then drop the rw lock at the level read back from the iocb. */
ocfs2_iocb_clear_rw_locked(iocb);
|
|
|
|
|
|
|
|
|
|
|
level = ocfs2_iocb_rw_locked_level(iocb);
|
|
|
|
|
ocfs2_rw_unlock(inode, level);
|
2016-11-10 06:42:49 +08:00
|
|
|
|
return ret;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
}
|
|
|
|
|
|
2016-04-07 23:51:58 +08:00
|
|
|
|
static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
{
|
|
|
|
|
struct file *file = iocb->ki_filp;
|
2015-12-05 12:45:44 +08:00
|
|
|
|
struct inode *inode = file->f_mapping->host;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
|
|
|
|
get_block_t *get_block;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Fallback to buffered I/O if we see an inode without
|
|
|
|
|
* extents.
|
|
|
|
|
*/
|
|
|
|
|
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
/* Fallback to buffered I/O if we do not support append dio. */
|
2016-04-07 23:51:58 +08:00
|
|
|
|
if (iocb->ki_pos + iter->count > i_size_read(inode) &&
|
|
|
|
|
!ocfs2_supports_append_dio(osb))
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
if (iov_iter_rw(iter) == READ)
|
2017-11-16 09:31:44 +08:00
|
|
|
|
get_block = ocfs2_lock_get_block;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
else
|
2017-11-16 09:31:44 +08:00
|
|
|
|
get_block = ocfs2_dio_wr_get_block;
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
|
|
|
|
|
return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
|
2016-04-07 23:51:58 +08:00
|
|
|
|
iter, get_block,
|
ocfs2: fix sparse file & data ordering issue in direct io
There are mainly three issues in the direct io code path after commit
24c40b329e03 ("ocfs2: implement ocfs2_direct_IO_write"):
* Does not support sparse file.
* Does not support data ordering. eg: when write to a file hole, it
will alloc extent first. If system crashed before io finished, data
will corrupt.
* Potential risk when doing aio+dio. The -EIOCBQUEUED return value is
likely to be ignored by ocfs2_direct_IO_write().
To resolve above problems, re-design direct io code with following ideas:
* Use buffer io to fill in holes. And this will make better
performance also.
* Clear unwritten after direct write finished. So we can make sure
meta data changes after data write to disk. (Unwritten extent is
invisible to user, from user's view, meta data is not changed when
allocate an unwritten extent.)
* Clear ocfs2_direct_IO_write(). Do all ending work in end_io.
This patch has passed fs,dio,ltp-aiodio.part1,ltp-aiodio.part2,ltp-aiodio.part4
test cases of ltp.
For performance improvement, see following test result:
ocfs2 cluster size 1MB, ocfs2 volume is mounted on /mnt/.
The original way:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 1707.83 s, 2.5 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 582.705 s, 7.4 MB/s
After this patch:
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=4K count=1048576 oflag=direct
1048576+0 records in
1048576+0 records out
4294967296 bytes (4.3 GB) copied, 64.6412 s, 66.4 MB/s
+ rm /mnt/test.img -f
+ dd if=/dev/zero of=/mnt/test.img bs=256K count=16384 oflag=direct
16384+0 records in
16384+0 records out
4294967296 bytes (4.3 GB) copied, 34.7611 s, 124 MB/s
Signed-off-by: Ryan Ding <ryan.ding@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-26 05:21:09 +08:00
|
|
|
|
ocfs2_dio_end_io, NULL, 0);
|
|
|
|
|
}
|
|
|
|
|
|
2006-06-28 19:26:44 +08:00
|
|
|
|
/*
 * ocfs2_aops - address_space_operations table defined by this file.
 *
 * Callbacks with an ocfs2_ prefix (read_folio, readahead, writepage,
 * write_begin, write_end, bmap, direct_IO, release_folio) point at
 * ocfs2-specific implementations.  The remaining slots (dirty_folio,
 * invalidate_folio, migratepage, is_partially_uptodate,
 * error_remove_page) are wired to the shared block_*, buffer_* and
 * generic_* helpers rather than to ocfs2 code.
 *
 * NOTE(review): presumably installed as the a_ops of ocfs2 file
 * mappings -- confirm against the inode setup code, which is not
 * visible here.
 */
const struct address_space_operations ocfs2_aops = {
|
2022-02-10 04:22:12 +08:00
|
|
|
|
.dirty_folio = block_dirty_folio,
|
2022-04-29 23:12:16 +08:00
|
|
|
|
.read_folio = ocfs2_read_folio,
|
fs: convert mpage_readpages to mpage_readahead
Implement the new readahead aop and convert all callers (block_dev,
exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6,
reiserfs & udf).
The callers are all trivial except for GFS2 & OCFS2.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> # ocfs2
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> # ocfs2
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Chao Yu <yuchao0@huawei.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Gao Xiang <gaoxiang25@huawei.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-02 12:47:02 +08:00
|
|
|
|
.readahead = ocfs2_readahead,
|
2009-03-05 16:22:21 +08:00
|
|
|
|
.writepage = ocfs2_writepage,
|
|
|
|
|
.write_begin = ocfs2_write_begin,
|
|
|
|
|
.write_end = ocfs2_write_end,
|
|
|
|
|
.bmap = ocfs2_bmap,
|
|
|
|
|
.direct_IO = ocfs2_direct_IO,
|
2022-02-10 04:21:34 +08:00
|
|
|
|
.invalidate_folio = block_invalidate_folio,
|
2022-05-01 12:10:21 +08:00
|
|
|
|
.release_folio = ocfs2_release_folio,
|
2009-03-05 16:22:21 +08:00
|
|
|
|
.migratepage = buffer_migrate_page,
|
|
|
|
|
.is_partially_uptodate = block_is_partially_uptodate,
|
2009-09-16 17:50:16 +08:00
|
|
|
|
.error_remove_page = generic_error_remove_page,
|
2005-12-16 06:31:24 +08:00
|
|
|
|
};
|