xfs: lockless buffer cache lookups

Current work to merge the XFS inode life cycle with the VFS inode
 life cycle is finding some interesting issues. If we have a path
 that hits buffer trylocks fairly hard (e.g. a non-blocking
 background inode freeing function), we end up hitting massive
 contention on the buffer cache hash locks:
 
 -   92.71%     0.05%  [kernel]                  [k] xfs_inodegc_worker
    - 92.67% xfs_inodegc_worker
       - 92.13% xfs_inode_unlink
          - 91.52% xfs_inactive_ifree
             - 85.63% xfs_read_agi
                - 85.61% xfs_trans_read_buf_map
                   - 85.59% xfs_buf_read_map
                      - xfs_buf_get_map
                         - 85.55% xfs_buf_find
                            - 72.87% _raw_spin_lock
                               - do_raw_spin_lock
                                    71.86% __pv_queued_spin_lock_slowpath
                            - 8.74% xfs_buf_rele
                               - 7.88% _raw_spin_lock
                                  - 7.88% do_raw_spin_lock
                                       7.63% __pv_queued_spin_lock_slowpath
                            - 1.70% xfs_buf_trylock
                               - 1.68% down_trylock
                                  - 1.41% _raw_spin_lock_irqsave
                                     - 1.39% do_raw_spin_lock
                                          __pv_queued_spin_lock_slowpath
                            - 0.76% _raw_spin_unlock
                                 0.75% do_raw_spin_unlock
 
 This is basically hammering the pag->pag_buf_lock from lots of CPUs
 doing trylocks at the same time. Most of the buffer trylock
 operations ultimately fail after we've done the lookup, so we're
 really hammering the buf hash lock whilst making no progress.
 
 We can also see significant spinlock traffic on the same lock just
 under normal operation when lots of tasks are accessing metadata
 from the same AG, so let's avoid all this by creating a lookup fast
 path which leverages the rhashtable's ability to do RCU protected
 lookups.
 
 Signed-off-by: Dave Chinner <dchinner@redhat.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQJIBAABCgAyFiEEmJOoJ8GffZYWSjj/regpR/R1+h0FAmLPvngUHGRhdmlkQGZy
 b21vcmJpdC5jb20ACgkQregpR/R1+h0gTw/9EK1gj31QpurgGziYsL0JFI1Uq33Z
 2rB/yTJXzxe+J7cE6B2RYuSj4EK7YI1aZXTRC5De5A8TqbFaNztrigqxNNpm3jh0
 T0AbVQoY7XzjbvMHQ0VFPBcJGcVbQypA+rabSlLHfU9zfN3t4EnM+BmuaFqygGZj
 1A6ZjkVChmEprGjd16846sgvMqdLa4yJ4/9Jsu5WlI+vPZj9gJX/7Mjc580Zljb5
 gg9Cf8ziW78gpHzj3ufSWv2jBcWcMdyHpyCF/fNceROUaxmZKsMUDKcsia9TyQhB
 yJXxw9Rnb3F23VJSYMJIcf4+RTd7iqd88GhEEFYxj41gI/jQxqRovlS1ljk2l20R
 3i4TUs7yF24sLLQdL8YkJiGCOEvRqPPcNd4xfGwdioRwXwoEqB7L/vYpUheQ8qSZ
 Tnn4vmGm+GQHNnQNhxiF8KkAd9gwcUslN36ZJn+h3zjvfgAFQFChsk+3CoFoxsth
 BpbFT3lo4Hc6xJBDCp7Z3Gxurxq5fQ2CGYHxCBT4feNkZS5YOLd/Os2hIZVId8XA
 jp66ZyELd8zj+CxMp4ZyYqsFETIao13B8KPEqvI2/obEDE6p/++olP8aqKIP1C8d
 ASOjxP8KqWEHLe3or4W3m2WSDa5fp1b3G/mjS7r/jDKqIuTMZXYw4CJx1x3rTr4F
 nXAnlWoGVq7HjWc=
 =8UYp
 -----END PGP SIGNATURE-----

Merge tag 'xfs-buf-lockless-lookup-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs into xfs-5.20-mergeB

xfs: lockless buffer cache lookups

Current work to merge the XFS inode life cycle with the VFS inode
life cycle is finding some interesting issues. If we have a path
that hits buffer trylocks fairly hard (e.g. a non-blocking
background inode freeing function), we end up hitting massive
contention on the buffer cache hash locks:

-   92.71%     0.05%  [kernel]                  [k] xfs_inodegc_worker
   - 92.67% xfs_inodegc_worker
      - 92.13% xfs_inode_unlink
         - 91.52% xfs_inactive_ifree
            - 85.63% xfs_read_agi
               - 85.61% xfs_trans_read_buf_map
                  - 85.59% xfs_buf_read_map
                     - xfs_buf_get_map
                        - 85.55% xfs_buf_find
                           - 72.87% _raw_spin_lock
                              - do_raw_spin_lock
                                   71.86% __pv_queued_spin_lock_slowpath
                           - 8.74% xfs_buf_rele
                              - 7.88% _raw_spin_lock
                                 - 7.88% do_raw_spin_lock
                                      7.63% __pv_queued_spin_lock_slowpath
                           - 1.70% xfs_buf_trylock
                              - 1.68% down_trylock
                                 - 1.41% _raw_spin_lock_irqsave
                                    - 1.39% do_raw_spin_lock
                                         __pv_queued_spin_lock_slowpath
                           - 0.76% _raw_spin_unlock
                                0.75% do_raw_spin_unlock

This is basically hammering the pag->pag_buf_lock from lots of CPUs
doing trylocks at the same time. Most of the buffer trylock
operations ultimately fail after we've done the lookup, so we're
really hammering the buf hash lock whilst making no progress.

We can also see significant spinlock traffic on the same lock just
under normal operation when lots of tasks are accessing metadata
from the same AG, so let's avoid all this by creating a lookup fast
path which leverages the rhashtable's ability to do RCU protected
lookups.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>

* tag 'xfs-buf-lockless-lookup-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: lockless buffer lookup
  xfs: remove a superflous hash lookup when inserting new buffers
  xfs: reduce the number of atomic when locking a buffer after lookup
  xfs: merge xfs_buf_find() and xfs_buf_get_map()
  xfs: break up xfs_buf_find() into individual pieces
  xfs: rework xfs_buf_incore() API
This commit is contained in:
Darrick J. Wong 2022-07-14 09:22:14 -07:00
commit 35c5a09f53
5 changed files with 188 additions and 135 deletions

View File

@ -543,6 +543,7 @@ xfs_attr_rmtval_stale(
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buf *bp;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@ -550,14 +551,18 @@ xfs_attr_rmtval_stale(
XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
return -EFSCORRUPTED;
bp = xfs_buf_incore(mp->m_ddev_targp,
error = xfs_buf_incore(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map->br_startblock),
XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
if (bp) {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
XFS_FSB_TO_BB(mp, map->br_blockcount),
incore_flags, &bp);
if (error) {
if (error == -ENOENT)
return 0;
return error;
}
xfs_buf_stale(bp);
xfs_buf_relse(bp);
return 0;
}

View File

@ -454,16 +454,19 @@ xrep_invalidate_blocks(
* assume it's owned by someone else.
*/
for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
int error;
/* Skip AG headers and post-EOFS blocks */
if (!xfs_verify_fsbno(sc->mp, fsbno))
continue;
bp = xfs_buf_incore(sc->mp->m_ddev_targp,
error = xfs_buf_incore(sc->mp->m_ddev_targp,
XFS_FSB_TO_DADDR(sc->mp, fsbno),
XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
if (bp) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
}
XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
if (error)
continue;
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
}
return 0;

View File

@ -294,6 +294,16 @@ xfs_buf_free_pages(
bp->b_flags &= ~_XBF_PAGES;
}
static void
xfs_buf_free_callback(
struct callback_head *cb)
{
struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
xfs_buf_free_maps(bp);
kmem_cache_free(xfs_buf_cache, bp);
}
static void
xfs_buf_free(
struct xfs_buf *bp)
@ -307,8 +317,7 @@ xfs_buf_free(
else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
xfs_buf_free_maps(bp);
kmem_cache_free(xfs_buf_cache, bp);
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
@ -503,100 +512,45 @@ xfs_buf_hash_destroy(
rhashtable_destroy(&pag->pag_buf_hash);
}
/*
* Look up a buffer in the buffer cache and return it referenced and locked
* in @found_bp.
*
* If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
* cache.
*
* If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
* -EAGAIN if we fail to lock it.
*
* Return values are:
* -EFSCORRUPTED if have been supplied with an invalid address
* -EAGAIN on trylock failure
* -ENOENT if we fail to find a match and @new_bp was NULL
* 0, with @found_bp:
* - @new_bp if we inserted it into the cache
* - the buffer we found and locked.
*/
static int
xfs_buf_find(
xfs_buf_map_verify(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf *new_bp,
struct xfs_buf **found_bp)
struct xfs_buf_map *map)
{
struct xfs_perag *pag;
struct xfs_buf *bp;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
xfs_daddr_t eofs;
int i;
*found_bp = NULL;
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
if (map->bm_bn < 0 || map->bm_bn >= eofs) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
__func__, cmap.bm_bn, eofs);
__func__, map->bm_bn, eofs);
WARN_ON(1);
return -EFSCORRUPTED;
}
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
xfs_buf_hash_params);
if (bp) {
atomic_inc(&bp->b_hold);
goto found;
}
/* No match found */
if (!new_bp) {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
return -ENOENT;
}
/* the buffer keeps the perag reference until it is freed */
new_bp->b_pag = pag;
rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
xfs_buf_hash_params);
spin_unlock(&pag->pag_buf_lock);
*found_bp = new_bp;
return 0;
}
found:
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
static int
xfs_buf_find_lock(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
if (flags & XBF_TRYLOCK) {
if (!xfs_buf_trylock(bp)) {
XFS_STATS_INC(bp->b_mount, xb_busy_locked);
return -EAGAIN;
}
} else {
xfs_buf_lock(bp);
XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
}
/*
@ -609,57 +563,59 @@ found:
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
*found_bp = bp;
return 0;
}
struct xfs_buf *
xfs_buf_incore(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
xfs_buf_flags_t flags)
static inline int
xfs_buf_lookup(
struct xfs_perag *pag,
struct xfs_buf_map *map,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
struct xfs_buf *bp;
struct xfs_buf *bp;
int error;
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
if (error)
return NULL;
return bp;
rcu_read_lock();
bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
rcu_read_unlock();
return -ENOENT;
}
rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
if (error) {
xfs_buf_rele(bp);
return error;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
}
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
* more hits than misses.
* Insert the new_bp into the hash table. This consumes the perag reference
* taken for the lookup regardless of the result of the insert.
*/
int
xfs_buf_get_map(
struct xfs_buftarg *target,
static int
xfs_buf_find_insert(
struct xfs_buftarg *btp,
struct xfs_perag *pag,
struct xfs_buf_map *cmap,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
struct xfs_buf *bp;
struct xfs_buf *new_bp;
struct xfs_buf *bp;
int error;
*bpp = NULL;
error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
if (!error)
goto found;
if (error != -ENOENT)
return error;
error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
return error;
goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
@ -674,18 +630,94 @@ xfs_buf_get_map(
goto out_free_buf;
}
error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
if (error)
spin_lock(&pag->pag_buf_lock);
bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
error = PTR_ERR(bp);
spin_unlock(&pag->pag_buf_lock);
goto out_free_buf;
}
if (bp) {
/* found an existing buffer */
atomic_inc(&bp->b_hold);
spin_unlock(&pag->pag_buf_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
else
*bpp = bp;
goto out_free_buf;
}
if (bp != new_bp)
xfs_buf_free(new_bp);
/* The new buffer keeps the perag reference until it is freed. */
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
*bpp = new_bp;
return 0;
found:
out_free_buf:
xfs_buf_free(new_bp);
out_drop_pag:
xfs_perag_put(pag);
return error;
}
/*
* Assembles a buffer covering the specified range. The code is optimised for
* cache hits, as metadata intensive workloads will see 3 orders of magnitude
* more hits than misses.
*/
int
xfs_buf_get_map(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
struct xfs_perag *pag;
struct xfs_buf *bp = NULL;
struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
int error;
int i;
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
error = xfs_buf_map_verify(btp, &cmap);
if (error)
return error;
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
error = xfs_buf_lookup(pag, &cmap, flags, &bp);
if (error && error != -ENOENT)
goto out_put_perag;
/* cache hits always outnumber misses by at least 10:1 */
if (unlikely(!bp)) {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
if (flags & XBF_INCORE)
goto out_put_perag;
/* xfs_buf_find_insert() consumes the perag reference. */
error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
flags, &bp);
if (error)
return error;
} else {
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
xfs_perag_put(pag);
}
/* We do not hold a perag reference anymore. */
if (!bp->b_addr) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn_ratelimited(target->bt_mount,
xfs_warn_ratelimited(btp->bt_mount,
"%s: failed to map %u pages", __func__,
bp->b_page_count);
xfs_buf_relse(bp);
@ -700,12 +732,13 @@ found:
if (!(flags & XBF_READ))
xfs_buf_ioerror(bp, 0);
XFS_STATS_INC(target->bt_mount, xb_get);
XFS_STATS_INC(btp->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
*bpp = bp;
return 0;
out_free_buf:
xfs_buf_free(new_bp);
out_put_perag:
xfs_perag_put(pag);
return error;
}

View File

@ -42,9 +42,11 @@ struct xfs_buf;
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
#define XFS_BUF_FLAGS \
@ -63,6 +65,7 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
@ -193,13 +196,10 @@ struct xfs_buf {
int b_last_error;
const struct xfs_buf_ops *b_ops;
struct rcu_head b_rcu;
};
/* Finding and Reading Buffers */
struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
xfs_daddr_t blkno, size_t numblks,
xfs_buf_flags_t flags);
int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map,
@ -209,6 +209,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
const struct xfs_buf_ops *ops);
static inline int
xfs_buf_incore(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp);
}
static inline int
xfs_buf_get(
struct xfs_buftarg *target,

View File

@ -1229,12 +1229,11 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0);
if (!bp) {
error = -EINVAL;
error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp);
if (error)
goto out_unlock;
}
xfs_buf_unlock(bp);
xfs_buf_delwri_pushbuf(bp, buffer_list);