Merge branch 'xfs-collapse-range' into for-next

This commit is contained in:
Dave Chinner 2014-03-13 19:11:06 +11:00
commit b6db0551fd
8 changed files with 390 additions and 8 deletions

View File

@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EINVAL;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
/* Punch hole and zero range are mutually exclusive */
if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
/* Punch hole must have keep size set */
@ -239,11 +245,20 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
!(mode & FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
/* Collapse range should only be used exclusively. */
if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
(mode & ~FALLOC_FL_COLLAPSE_RANGE))
return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/* It's not possible punch hole on append only file */
if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
/*
* It's not possible to punch hole or perform collapse range
* on append only file
*/
if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)
&& IS_APPEND(inode))
return -EPERM;
if (IS_IMMUTABLE(inode))
@ -271,6 +286,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
return -EFBIG;
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
(offset + len >= i_size_read(inode)))
return -EINVAL;
if (!file->f_op->fallocate)
return -EOPNOTSUPP;

View File

@ -5378,3 +5378,196 @@ error0:
}
return error;
}
/*
 * Shift extent records to the left to cover a hole.
 *
 * @tp:               transaction the inode (and bmap btree) changes are made in
 * @ip:               inode whose data fork extents are shifted
 * @done:             set to 1 when there are no more extents to shift; it is
 *                    never cleared here, so the caller must initialise it
 * @start_fsb:        file offset (in fs blocks) used to look up the first
 *                    extent to shift when *@current_ext is 0
 * @offset_shift_fsb: number of fs blocks each extent's start offset is
 *                    reduced by
 * @current_ext:      in/out index of the next extent to shift; lets the
 *                    caller resume across several calls/transactions
 * @firstblock,
 * @flist:            handed to the bmap btree cursor when the fork is in
 *                    btree format
 * @num_exts:         maximum number of extents to shift in this call
 *
 * If there is no hole large enough to shift an extent into, the operation is
 * considered invalid and we abort immediately with EINVAL.  If a shifted
 * extent becomes contiguous with its left neighbour the two records are
 * merged.
 *
 * The inode is logged only if an extent record was actually modified, so a
 * call that fails validation before changing anything does not dirty @tp.
 *
 * Returns 0 on success or an error code on failure.
 */
int
xfs_bmap_shift_extents(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	int			*done,
	xfs_fileoff_t		start_fsb,
	xfs_fileoff_t		offset_shift_fsb,
	xfs_extnum_t		*current_ext,
	xfs_fsblock_t		*firstblock,
	struct xfs_bmap_free	*flist,
	int			num_exts)
{
	struct xfs_btree_cur		*cur;
	struct xfs_bmbt_rec_host	*gotp;
	struct xfs_bmbt_irec		got;
	struct xfs_bmbt_irec		left;
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_ifork		*ifp;
	xfs_extnum_t			nexts = 0;
	xfs_fileoff_t			startoff;
	int				error = 0;
	int				i;
	int				whichfork = XFS_DATA_FORK;
	int				logflags = 0;
	xfs_filblks_t			blockcount = 0;

	if (unlikely(XFS_TEST_ERROR(
	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
				 XFS_ERRLEVEL_LOW, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	ASSERT(current_ext != NULL);

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
		/* Read in all the extents */
		error = xfs_iread_extents(tp, ip, whichfork);
		if (error)
			return error;
	}

	/*
	 * If *current_ext is 0, we would need to lookup the extent
	 * from where we would start shifting and store it in gotp.
	 */
	if (!*current_ext) {
		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
		/*
		 * gotp can be null in 2 cases: 1) if there are no extents
		 * or 2) start_fsb lies in a hole beyond which there are
		 * no extents. Either way, we are done.
		 */
		if (!gotp) {
			*done = 1;
			return 0;
		}
	}

	/*
	 * A btree format fork is updated through a cursor; an extents format
	 * fork is updated in place and logged via XFS_ILOG_DEXT once a record
	 * has changed.
	 */
	if (ifp->if_flags & XFS_IFBROOT) {
		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
		cur->bc_private.b.firstblock = *firstblock;
		cur->bc_private.b.flist = flist;
		cur->bc_private.b.flags = 0;
	} else {
		cur = NULL;
	}

	while (nexts++ < num_exts &&
	       *current_ext < XFS_IFORK_NEXTENTS(ip, whichfork)) {

		gotp = xfs_iext_get_ext(ifp, *current_ext);
		xfs_bmbt_get_all(gotp, &got);
		startoff = got.br_startoff - offset_shift_fsb;

		/*
		 * Before shifting extent into hole, make sure that the hole
		 * is large enough to accommodate the shift.
		 */
		if (*current_ext) {
			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
						*current_ext - 1), &left);

			if (startoff < left.br_startoff + left.br_blockcount)
				error = XFS_ERROR(EINVAL);
		} else if (offset_shift_fsb > got.br_startoff) {
			/*
			 * When the first extent is shifted, offset_shift_fsb
			 * must not exceed the starting offset of that
			 * extent.
			 */
			error = XFS_ERROR(EINVAL);
		}

		/* Nothing has been modified for this extent yet. */
		if (error)
			goto del_cursor;

		if (cur) {
			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
						   got.br_startblock,
						   got.br_blockcount,
						   &i);
			if (error)
				goto del_cursor;
			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
		}

		/* Check if we can merge 2 adjacent extents */
		if (*current_ext &&
		    left.br_startoff + left.br_blockcount == startoff &&
		    left.br_startblock + left.br_blockcount ==
				got.br_startblock &&
		    left.br_state == got.br_state &&
		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
			blockcount = left.br_blockcount +
				got.br_blockcount;

			/* From here on the in-core fork has been changed. */
			xfs_iext_remove(ip, *current_ext, 1, 0);
			logflags |= XFS_ILOG_CORE;
			if (cur) {
				error = xfs_btree_delete(cur, &i);
				if (error)
					goto del_cursor;
				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
			} else {
				logflags |= XFS_ILOG_DEXT;
			}
			XFS_IFORK_NEXT_SET(ip, whichfork,
				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
			gotp = xfs_iext_get_ext(ifp, --*current_ext);
			xfs_bmbt_get_all(gotp, &got);

			/* Make cursor point to the extent we will update */
			if (cur) {
				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
							   got.br_startblock,
							   got.br_blockcount,
							   &i);
				if (error)
					goto del_cursor;
				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
			}

			xfs_bmbt_set_blockcount(gotp, blockcount);
			got.br_blockcount = blockcount;
		} else {
			/* We have to update the startoff */
			xfs_bmbt_set_startoff(gotp, startoff);
			got.br_startoff = startoff;
		}

		/* An in-core record was modified, so the inode must be logged. */
		logflags |= XFS_ILOG_CORE;
		if (cur) {
			error = xfs_bmbt_update(cur, got.br_startoff,
						got.br_startblock,
						got.br_blockcount,
						got.br_state);
			if (error)
				goto del_cursor;
		} else {
			logflags |= XFS_ILOG_DEXT;
		}

		(*current_ext)++;
	}

	/* Check if we are done */
	if (*current_ext == XFS_IFORK_NEXTENTS(ip, whichfork))
		*done = 1;

del_cursor:
	if (cur)
		xfs_btree_del_cursor(cur,
			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);

	/* Only dirty the transaction if something was actually changed. */
	if (logflags)
		xfs_trans_log_inode(tp, ip, logflags);

	return error;
}

View File

@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
{ BMAP_RIGHT_FILLING, "RF" }, \
{ BMAP_ATTRFORK, "ATTR" }
/*
 * This macro determines how many extents are shifted in one write
 * transaction. Shifting a single extent could require two splits:
 * an extent move on the first and an extent merge on the second.
 * Therefore only one extent is shifted per write transaction.
 */
#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
#ifdef DEBUG
void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int whichfork, unsigned long caller_ip);
@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
/*
 * Shift up to @num_exts data fork extents left by @offset_shift_fsb blocks,
 * starting from @start_fsb (or resuming at *@current_ext when it is non-zero).
 * Sets *@done to 1 when no extents remain to be shifted.
 */
int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int *done, xfs_fileoff_t start_fsb,
xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
int num_exts);
#endif /* __XFS_BMAP_H__ */

View File

@ -1349,7 +1349,6 @@ xfs_free_file_space(
* the freeing of the space succeeds at ENOSPC.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
/*
@ -1467,6 +1466,102 @@ out:
}
/*
 * xfs_collapse_file_space()
 *	Remove the byte range [offset, offset + len) from a file and close
 *	the gap.
 *
 *	The range is first released with xfs_free_file_space().  After that,
 *	every extent at or beyond the end of the range is moved left by the
 *	length of the range, XFS_BMAP_MAX_SHIFT_EXTENTS extent(s) per write
 *	transaction, until xfs_bmap_shift_extents() reports that it is done.
 *
 *	The caller must hold XFS_IOLOCK_EXCL on the inode (asserted); the
 *	ILOCK is taken and dropped here around each transaction.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
int
xfs_collapse_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	struct xfs_bmap_free	flist;
	xfs_fsblock_t		firstfsb;
	xfs_fileoff_t		shift_from_fsb;
	xfs_fileoff_t		shift_by_fsb;
	xfs_extnum_t		next_ext = 0;
	int			finished = 0;
	int			committed;
	int			error;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));

	trace_xfs_collapse_file_space(ip);

	/* First block past the collapsed range, and the shift distance. */
	shift_from_fsb = XFS_B_TO_FSB(mp, offset + len);
	shift_by_fsb = XFS_B_TO_FSB(mp, len);

	error = xfs_free_file_space(ip, offset, len);
	if (error)
		return error;

	do {
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		tp->t_flags |= XFS_TRANS_RESERVE;

		/*
		 * A block reservation is needed because merging a shifted
		 * extent with its neighbour may free a block while the
		 * record is being updated.
		 */
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
		if (error) {
			/* The ILOCK is not held yet, so just drop the trans. */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);

		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
				ip->i_gdquot, ip->i_pdquot,
				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
				XFS_QMOPT_RES_REGBLKS);
		if (!error) {
			xfs_trans_ijoin(tp, ip, 0);
			xfs_bmap_init(&flist, &firstfsb);

			/*
			 * The write transaction only allows for a maximum of
			 * two bmbt updates.
			 */
			error = xfs_bmap_shift_extents(tp, ip, &finished,
					shift_from_fsb, shift_by_fsb,
					&next_ext, &firstfsb, &flist,
					XFS_BMAP_MAX_SHIFT_EXTENTS);
		}
		if (!error)
			error = xfs_bmap_finish(&tp, &flist, &committed);

		if (error) {
			/* Abort this transaction and release the ILOCK. */
			xfs_trans_cancel(tp,
				XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			return error;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	} while (!error && !finished);

	return error;
}
/*
* We need to check that the format of the data fork in the temporary inode is
* valid for the target inode before doing the swap. This is not a problem with

View File

@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
/* Free the byte range [offset, offset + len) and shift later extents left. */
int xfs_collapse_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);

View File

@ -823,7 +823,8 @@ xfs_file_fallocate(
if (!S_ISREG(inode->i_mode))
return -EINVAL;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
xfs_ilock(ip, XFS_IOLOCK_EXCL);
@ -831,6 +832,20 @@ xfs_file_fallocate(
error = xfs_free_file_space(ip, offset, len);
if (error)
goto out_unlock;
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
if (offset & blksize_mask || len & blksize_mask) {
error = -EINVAL;
goto out_unlock;
}
ASSERT(offset + len < i_size_read(inode));
new_size = i_size_read(inode) - len;
error = xfs_collapse_file_space(ip, offset, len);
if (error)
goto out_unlock;
} else {
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
@ -840,8 +855,11 @@ xfs_file_fallocate(
goto out_unlock;
}
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
if (mode & FALLOC_FL_ZERO_RANGE)
error = xfs_zero_file_space(ip, offset, len);
else
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
}
@ -859,7 +877,7 @@ xfs_file_fallocate(
if (ip->i_d.di_mode & S_IXGRP)
ip->i_d.di_mode &= ~S_ISGID;
if (!(mode & FALLOC_FL_PUNCH_HOLE))
if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

View File

@ -603,6 +603,7 @@ DEFINE_INODE_EVENT(xfs_readlink);
DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
DEFINE_INODE_EVENT(xfs_collapse_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL
DEFINE_INODE_EVENT(xfs_get_acl);

View File

@ -5,5 +5,40 @@
#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */
#define FALLOC_FL_NO_HIDE_STALE 0x04 /* reserved codepoint */
/*
* FALLOC_FL_COLLAPSE_RANGE is used to remove a range of a file
* without leaving a hole in the file. The contents of the file beyond
* the range being removed is appended to the start offset of the range
* being removed (i.e. the hole that was punched is "collapsed"),
* resulting in a file layout that looks like the range that was
* removed never existed. As such collapsing a range of a file changes
* the size of the file, reducing it by the same length of the range
* that has been removed by the operation.
*
* Different filesystems may implement different limitations on the
* granularity of the operation. Most will limit operations to
* filesystem block size boundaries, but this boundary may be larger or
* smaller depending on the filesystem and/or the configuration of the
* filesystem or file.
*
* Attempting to collapse a range that crosses the end of the file is
* considered an illegal operation - just use ftruncate(2) if you need
* to collapse a range that crosses EOF.
*/
#define FALLOC_FL_COLLAPSE_RANGE 0x08
/*
* FALLOC_FL_ZERO_RANGE is used to convert a range of file to zeros preferably
* without issuing data IO. Blocks should be preallocated for the regions that
* span holes in the file, and the entire range is preferably converted to
* unwritten extents - even though file system may choose to zero out the
* extent or do whatever which will result in reading zeros from the range
* while the range remains allocated for the file.
*
* This can be also used to preallocate blocks past EOF in the same way as
* with fallocate. Flag FALLOC_FL_KEEP_SIZE should cause the inode
* size to remain the same.
*/
#define FALLOC_FL_ZERO_RANGE 0x10
#endif /* _UAPI_FALLOC_H_ */