From da062d16a897c0759ae907e786bc0bea950c0c9d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:47 -0700 Subject: [PATCH 1/7] xfs: check for sparse inode clusters that cross new EOAG when shrinking While running xfs/168, I noticed occasional write verifier shutdowns involving inodes at the very end of the filesystem. Existing inode btree validation code checks that all inode clusters are fully contained within the filesystem. However, due to inadequate checking in the fs shrink code, it's possible that there could be a sparse inode cluster at the end of the filesystem where the upper inodes of the cluster are marked as holes and the corresponding blocks are free. In this case, the last blocks in the AG are listed in the bnobt. This enables the shrink to proceed but results in a filesystem that trips the inode verifiers. Fix this by disallowing the shrink. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang --- fs/xfs/libxfs/xfs_ag.c | 8 ++++++ fs/xfs/libxfs/xfs_ialloc.c | 55 ++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_ialloc.h | 3 +++ 3 files changed, 66 insertions(+) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 778ec52cce70..ee9ec0c50bec 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -803,6 +803,14 @@ xfs_ag_shrink_space( args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); + /* + * Make sure that the last inode cluster cannot overlap with the new + * end of the AG, even if it's sparse. + */ + error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta); + if (error) + return error; + /* * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 57d9cb632983..aaf8805a82df 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2928,3 +2928,58 @@ xfs_ialloc_calc_rootino( return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); } + +/* + * Ensure there are not sparse inode clusters that cross the new EOAG. + * + * This is a no-op for non-spinode filesystems since clusters are always fully + * allocated and checking the bnobt suffices. However, a spinode filesystem + * could have a record where the upper inodes are free blocks. If those blocks + * were removed from the filesystem, the inode record would extend beyond EOAG, + * which will be flagged as corruption. + */ +int +xfs_ialloc_check_shrink( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + xfs_agblock_t new_length) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_agino_t agino = XFS_AGB_TO_AGINO(mp, new_length); + int has; + int error; + + if (!xfs_sb_version_hassparseinodes(&mp->m_sb)) + return 0; + + pag = xfs_perag_get(mp, agno); + cur = xfs_inobt_init_cursor(mp, tp, agibp, pag, XFS_BTNUM_INO); + + /* Look up the inobt record that would correspond to the new EOFS. */ + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); + if (error || !has) + goto out; + + error = xfs_inobt_get_rec(cur, &rec, &has); + if (error) + goto out; + + if (!has) { + error = -EFSCORRUPTED; + goto out; + } + + /* If the record covers inodes that would be beyond EOFS, bail out. */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) { + error = -ENOSPC; + goto out; + } +out: + xfs_btree_del_cursor(cur, error); + xfs_perag_put(pag); + return error; +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 9df7c80408ff..9a2112b4ad5e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -122,4 +122,7 @@ int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); +int xfs_ialloc_check_shrink(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf *agibp, xfs_agblock_t new_length); + #endif /* __XFS_IALLOC_H__ */ From 5838d0356bb3c320867c393f12b169c01a870bda Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:48 -0700 Subject: [PATCH 2/7] xfs: reset child dir '..' entry when unlinking child While running xfs/168, I noticed a second source of post-shrink corruption errors causing shutdowns. Let's say that directory B has a low inode number and is a child of directory A, which has a high number. If B is empty but open, and unlinked from A, B's dotdot link continues to point to A. If A is then unlinked and the filesystem shrunk so that A is no longer a valid inode, a subsequent AIL push of B will trip the inode verifiers because the dotdot entry points outside of the filesystem. To avoid this problem, reset B's dotdot entry to the root directory when unlinking directories, since the root directory cannot be removed. Signed-off-by: Darrick J. Wong Reviewed-by: Gao Xiang --- fs/xfs/xfs_inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a835ceb79ba5..990b72ae3635 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2763,6 +2763,19 @@ xfs_remove( error = xfs_droplink(tp, ip); if (error) goto out_trans_cancel; + + /* + * Point the unlinked child directory's ".." entry to the root + * directory to eliminate back-references to inodes that may + * get freed before the child directory is closed. If the fs + * gets shrunk, this can lead to dirent inode validation errors. + */ + if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) + return error; + } } else { /* * When removing a non-directory we need to log the parent From 83193e5ebb0164d612aa620ceab7d3746e80e2a4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:50 -0700 Subject: [PATCH 3/7] xfs: correct the narrative around misaligned rtinherit/extszinherit dirs While auditing the realtime growfs code, I realized that the GROWFSRT ioctl (and by extension xfs_growfs) has always allowed sysadmins to change the realtime extent size when adding a realtime section to the filesystem. Since we also have always allowed sysadmins to set RTINHERIT and EXTSZINHERIT on directories even if there is no realtime device, this invalidates the premise laid out in the comments added in commit 603f000b15f2. In other words, this is not a case of inadequate metadata validation. This is a case of nearly forgotten (and apparently untested) but supported functionality. Update the comments to reflect what we've learned, and remove the log message about correcting the misalignment. Fixes: 603f000b15f2 ("xfs: validate extsz hints against rt extent size when rtinherit is set") Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_buf.c | 28 ++++++++++++++++------------ fs/xfs/libxfs/xfs_trans_inode.c | 10 ++++------ fs/xfs/xfs_ioctl.c | 8 ++++---- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 04ce361688f7..84ea2e0af9f0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -592,23 +592,27 @@ xfs_inode_validate_extsize( /* * This comment describes a historic gap in this verifier function. * - * On older kernels, the extent size hint verifier doesn't check that - * the extent size hint is an integer multiple of the realtime extent - * size on a directory with both RTINHERIT and EXTSZINHERIT flags set. - * The verifier has always enforced the alignment rule for regular - * files with the REALTIME flag set. + * For a directory with both RTINHERIT and EXTSZINHERIT flags set, this + * function has never checked that the extent size hint is an integer + * multiple of the realtime extent size. Since we allow users to set + * this combination on non-rt filesystems /and/ to change the rt + * extent size when adding a rt device to a filesystem, the net effect + * is that users can configure a filesystem anticipating one rt + * geometry and change their minds later. Directories do not use the + * extent size hint, so this is harmless for them. * * If a directory with a misaligned extent size hint is allowed to * propagate that hint into a new regular realtime file, the result * is that the inode cluster buffer verifier will trigger a corruption - * shutdown the next time it is run. + * shutdown the next time it is run, because the verifier has always + * enforced the alignment rule for regular files. * - * Unfortunately, there could be filesystems with these misconfigured - * directories in the wild, so we cannot add a check to this verifier - * at this time because that will result a new source of directory - * corruption errors when reading an existing filesystem. Instead, we - * permit the misconfiguration to pass through the verifiers so that - * callers of this function can correct and mitigate externally. + * Because we allow administrators to set a new rt extent size when + * adding a rt section, we cannot add a check to this verifier because + * that will result a new source of directory corruption errors when + * reading an existing filesystem. Instead, we rely on callers to + * decide when alignment checks are appropriate, and fix things up as + * needed. */ if (rt_flag) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 8d595a5c4abd..16f723ebe8dd 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -143,16 +143,14 @@ xfs_trans_log_inode( } /* - * Inode verifiers on older kernels don't check that the extent size - * hint is an integer multiple of the rt extent size on a directory - * with both rtinherit and extszinherit flags set. If we're logging a - * directory that is misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - xfs_info_once(ip->i_mount, - "Correcting misaligned extent size hint in inode 0x%llx.", ip->i_ino); ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT); ip->i_extsize = 0; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 65270e63c032..275d1c3123bd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1292,10 +1292,10 @@ xfs_ioctl_setattr_check_extsize( new_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); /* - * Inode verifiers on older kernels don't check that the extent size - * hint is an integer multiple of the rt extent size on a directory - * with both rtinherit and extszinherit flags set. Don't let sysadmins - * misconfigure directories. + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. Don't let sysadmins misconfigure + * directories. */ if ((new_diflags & XFS_DIFLAG_RTINHERIT) && (new_diflags & XFS_DIFLAG_EXTSZINHERIT)) { From 5aa5b278237f356f86205c4b03d4cc64a293850a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:51 -0700 Subject: [PATCH 4/7] xfs: don't expose misaligned extszinherit hints to userspace Commit 603f000b15f2 changed xfs_ioctl_setattr_check_extsize to reject an attempt to set an EXTSZINHERIT extent size hint on a directory with RTINHERIT set if the hint isn't a multiple of the realtime extent size. However, I have recently discovered that it is possible to change the realtime extent size when adding a rt device to a filesystem, which means that the existence of directories with misaligned inherited hints is not an accident. As a result, it's possible that someone could have set a valid hint and added an rt volume with a different rt extent size, which invalidates the ondisk hints. After such a sequence, FSGETXATTR will report a misaligned hint, which FSSETXATTR will trip over, causing confusion if the user was doing the usual GET/SET sequence to change some other attribute. Change xfs_fill_fsxattr to omit the hint if it isn't aligned properly. Fixes: 603f000b15f2 ("xfs: validate extsz hints against rt extent size when rtinherit is set") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_ioctl.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 275d1c3123bd..16039ea10ac9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1065,7 +1065,24 @@ xfs_fill_fsxattr( fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); - fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); + if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) { + fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); + } else if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { + /* + * Don't let a misaligned extent size hint on a directory + * escape to userspace if it won't pass the setattr checks + * later. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + ip->i_extsize % mp->m_sb.sb_rextsize > 0) { + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | + FS_XFLAG_EXTSZINHERIT); + fa->fsx_extsize = 0; + } else { + fa->fsx_extsize = XFS_FSB_TO_B(mp, ip->i_extsize); + } + } + if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) fa->fsx_cowextsize = XFS_FSB_TO_B(mp, ip->i_cowextsize); fa->fsx_projid = ip->i_projid; From 0e2af9296f4f9c4c815ced2beb21093af7c38644 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:48 -0700 Subject: [PATCH 5/7] xfs: improve FSGROWFSRT precondition checking Improve the checking at the start of a realtime grow operation so that we avoid accidentally set a new extent size that is too large and avoid adding an rt volume to a filesystem with rmap or reflink because we don't support rt rmap or reflink yet. While we're at it, separate the checks so that we're only testing one aspect at a time. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 4e7be6b4ca8e..8f6a05db4468 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -923,16 +923,41 @@ xfs_growfs_rt( uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; - /* - * Initial error checking. - */ + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || - (nrblocks = in->newblocks) <= sbp->sb_rblocks || - (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) + + /* Needs to have been mounted with an rt device. */ + if (!XFS_IS_REALTIME_MOUNT(mp)) return -EINVAL; - if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + /* + * Mount should fail if the rt bitmap/summary files don't load, but + * we'll check anyway. + */ + if (!mp->m_rbmip || !mp->m_rsumip) + return -EINVAL; + + /* Shrink not supported. */ + if (in->newblocks <= sbp->sb_rblocks) + return -EINVAL; + + /* Can only change rt extent size when adding rt volume. */ + if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize) + return -EINVAL; + + /* Range check the extent size. */ + if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE || + XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) + return -EINVAL; + + /* Unsupported realtime features. */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb) || + xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + nrblocks = in->newblocks; + error = xfs_sb_validate_fsb_count(sbp, nrblocks); + if (error) return error; /* * Read in the last block of the device, make sure it exists. From 0925fecc557471b6f6a488c3590a275151210572 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 12 Jul 2021 12:58:49 -0700 Subject: [PATCH 6/7] xfs: fix an integer overflow error in xfs_growfs_rt During a realtime grow operation, we run a single transaction for each rt bitmap block added to the filesystem. This means that each step has to be careful to increase sb_rblocks appropriately. Fix the integer overflow error in this calculation that can happen when the extent size is very large. Found by running growfs to add a rt volume to a filesystem formatted with a 1g rt extent size. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8f6a05db4468..699066fb9052 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1021,7 +1021,8 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { - xfs_trans_t *tp; + struct xfs_trans *tp; + xfs_rfsblock_t nrblocks_step; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1030,10 +1031,9 @@ xfs_growfs_rt( */ nsbp->sb_rextsize = in->extsize; nsbp->sb_rbmblocks = bmbno + 1; - nsbp->sb_rblocks = - XFS_RTMIN(nrblocks, - nsbp->sb_rbmblocks * NBBY * - nsbp->sb_blocksize * nsbp->sb_rextsize); + nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * + nsbp->sb_rextsize; + nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = nsbp->sb_rblocks; do_div(nsbp->sb_rextents, nsbp->sb_rextsize); ASSERT(nsbp->sb_rextents != 0); From b102a46ce16fd5550aed882c3c5b95f50da7992c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 14 Jul 2021 09:03:41 -0700 Subject: [PATCH 7/7] xfs: detect misaligned rtinherit directory extent size hints If we encounter a directory that has been configured to pass on an extent size hint to a new realtime file and the hint isn't an integer multiple of the rt extent size, we should flag the hint for administrative review because that is a misconfiguration (that other parts of the kernel will fix automatically). Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 61f90b2c9430..76fbc7ca4cec 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -73,11 +73,25 @@ xchk_inode_extsize( uint16_t flags) { xfs_failaddr_t fa; + uint32_t value = be32_to_cpu(dip->di_extsize); - fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize), - mode, flags); + fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags); if (fa) xchk_ino_set_corrupt(sc, ino); + + /* + * XFS allows a sysadmin to change the rt extent size when adding a rt + * section to a filesystem after formatting. If there are any + * directories with extszinherit and rtinherit set, the hint could + * become misaligned with the new rextsize. The verifier doesn't check + * this, because we allow rtinherit directories even without an rt + * device. Flag this as an administrative warning since we will clean + * this up eventually. + */ + if ((flags & XFS_DIFLAG_RTINHERIT) && + (flags & XFS_DIFLAG_EXTSZINHERIT) && + value % sc->mp->m_sb.sb_rextsize > 0) + xchk_ino_set_warning(sc, ino); } /*