diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0c94cde41016..57fcd5917a66 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -114,6 +114,18 @@ xfs_inode_free_callback( kmem_zone_free(xfs_inode_zone, ip); } +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -129,12 +141,7 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); + __xfs_inode_free(ip); } /* @@ -772,8 +779,7 @@ __xfs_inode_set_reclaim_tag( if (!pag->pag_ici_reclaimable) { /* propagate the reclaim tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno, XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); @@ -817,8 +823,7 @@ __xfs_inode_clear_reclaim( if (!pag->pag_ici_reclaimable) { /* clear the reclaim tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno, XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use 
RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster() has + * locked both the XFS_ILOCK and the flush lock, it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,7 +1028,7 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cbc9031731b..e3b27982b3b2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3239,6 +3239,19 @@ xfs_iflush_cluster( continue; } + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!iq->i_ino) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + /* * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing.