2018-06-06 10:42:14 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-11-02 11:58:39 +08:00
|
|
|
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_fs.h"
|
2013-10-23 07:36:05 +08:00
|
|
|
#include "xfs_shared.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_format.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_log_format.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_trans_resv.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_sb.h"
|
|
|
|
#include "xfs_mount.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_trans.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_error.h"
|
|
|
|
#include "xfs_alloc.h"
|
|
|
|
#include "xfs_fsops.h"
|
|
|
|
#include "xfs_trans_space.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_log.h"
|
2022-03-30 09:22:01 +08:00
|
|
|
#include "xfs_log_priv.h"
|
2018-05-14 14:10:08 +08:00
|
|
|
#include "xfs_ag.h"
|
2016-10-04 00:11:44 +08:00
|
|
|
#include "xfs_ag_resv.h"
|
2021-08-11 08:00:54 +08:00
|
|
|
#include "xfs_trace.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-03-24 10:05:38 +08:00
|
|
|
/*
|
|
|
|
* Write new AG headers to disk. Non-transactional, but need to be
|
|
|
|
* written and completed prior to the growfs transaction being logged.
|
|
|
|
* To do this, we use a delayed write buffer list and wait for
|
|
|
|
* submission and IO completion of the list as a whole. This allows the
|
|
|
|
* IO subsystem to merge all the AG headers in a single AG into a single
|
|
|
|
* IO and hide most of the latency of the IO from us.
|
|
|
|
*
|
|
|
|
* This also means that if we get an error whilst building the buffer
|
|
|
|
* list to write, we can cancel the entire list without having written
|
|
|
|
* anything.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_resizefs_init_new_ags(
|
|
|
|
struct xfs_trans *tp,
|
|
|
|
struct aghdr_init_data *id,
|
|
|
|
xfs_agnumber_t oagcount,
|
|
|
|
xfs_agnumber_t nagcount,
|
|
|
|
xfs_rfsblock_t delta,
|
2022-07-07 17:07:09 +08:00
|
|
|
struct xfs_perag *last_pag,
|
2021-03-24 10:05:38 +08:00
|
|
|
bool *lastag_extended)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = tp->t_mountp;
|
|
|
|
xfs_rfsblock_t nb = mp->m_sb.sb_dblocks + delta;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
*lastag_extended = false;
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&id->buffer_list);
|
|
|
|
for (id->agno = nagcount - 1;
|
|
|
|
id->agno >= oagcount;
|
|
|
|
id->agno--, delta -= id->agsize) {
|
|
|
|
|
|
|
|
if (id->agno == nagcount - 1)
|
|
|
|
id->agsize = nb - (id->agno *
|
|
|
|
(xfs_rfsblock_t)mp->m_sb.sb_agblocks);
|
|
|
|
else
|
|
|
|
id->agsize = mp->m_sb.sb_agblocks;
|
|
|
|
|
|
|
|
error = xfs_ag_init_headers(mp, id);
|
|
|
|
if (error) {
|
|
|
|
xfs_buf_delwri_cancel(&id->buffer_list);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
error = xfs_buf_delwri_submit(&id->buffer_list);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (delta) {
|
|
|
|
*lastag_extended = true;
|
2022-07-07 17:07:09 +08:00
|
|
|
error = xfs_ag_extend_space(last_pag, tp, delta);
|
2021-03-24 10:05:38 +08:00
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2018-05-14 14:10:08 +08:00
|
|
|
* growfs operations
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_growfs_data_private(
|
2021-02-03 10:24:06 +08:00
|
|
|
struct xfs_mount *mp, /* mount point for filesystem */
|
|
|
|
struct xfs_growfs_data *in) /* growfs data input struct */
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2020-12-17 08:07:34 +08:00
|
|
|
struct xfs_buf *bp;
|
2018-05-14 14:10:07 +08:00
|
|
|
int error;
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_agnumber_t nagcount;
|
|
|
|
xfs_agnumber_t nagimax = 0;
|
2021-02-03 10:24:06 +08:00
|
|
|
xfs_rfsblock_t nb, nb_div, nb_mod;
|
2021-03-24 10:05:39 +08:00
|
|
|
int64_t delta;
|
2023-07-07 09:00:59 +08:00
|
|
|
bool lastag_extended = false;
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_agnumber_t oagcount;
|
2021-02-03 10:24:06 +08:00
|
|
|
struct xfs_trans *tp;
|
2018-05-14 14:10:06 +08:00
|
|
|
struct aghdr_init_data id = {};
|
2022-07-07 17:07:09 +08:00
|
|
|
struct xfs_perag *last_pag;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
nb = in->newblocks;
|
2021-03-24 10:05:39 +08:00
|
|
|
error = xfs_sb_validate_fsb_count(&mp->m_sb, nb);
|
|
|
|
if (error)
|
2007-05-14 16:24:02 +08:00
|
|
|
return error;
|
2021-03-24 10:05:39 +08:00
|
|
|
|
|
|
|
if (nb > mp->m_sb.sb_dblocks) {
|
|
|
|
error = xfs_buf_read_uncached(mp->m_ddev_targp,
|
2010-09-22 08:47:20 +08:00
|
|
|
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
|
2014-10-02 07:05:32 +08:00
|
|
|
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
|
2021-03-24 10:05:39 +08:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
xfs_buf_relse(bp);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-02-03 10:24:06 +08:00
|
|
|
nb_div = nb;
|
|
|
|
nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
|
2023-06-13 23:49:20 +08:00
|
|
|
if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS)
|
|
|
|
nb_div++;
|
|
|
|
else if (nb_mod)
|
|
|
|
nb = nb_div * mp->m_sb.sb_agblocks;
|
|
|
|
|
|
|
|
if (nb_div > XFS_MAX_AGNUMBER + 1) {
|
|
|
|
nb_div = XFS_MAX_AGNUMBER + 1;
|
|
|
|
nb = nb_div * mp->m_sb.sb_agblocks;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2023-06-13 23:49:20 +08:00
|
|
|
nagcount = nb_div;
|
2021-02-03 10:24:06 +08:00
|
|
|
delta = nb - mp->m_sb.sb_dblocks;
|
2021-03-24 10:05:39 +08:00
|
|
|
/*
|
|
|
|
* Reject filesystems with a single AG because they are not
|
|
|
|
* supported, and reject a shrink operation that would cause a
|
|
|
|
* filesystem to become unsupported.
|
|
|
|
*/
|
|
|
|
if (delta < 0 && nagcount < 2)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2023-12-15 03:28:08 +08:00
|
|
|
/* No work to do */
|
|
|
|
if (delta == 0)
|
|
|
|
return 0;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
oagcount = mp->m_sb.sb_agcount;
|
2010-01-11 19:47:44 +08:00
|
|
|
/* allocate the new per-ag structures */
|
|
|
|
if (nagcount > oagcount) {
|
2022-07-07 17:13:02 +08:00
|
|
|
error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
|
2010-01-11 19:47:44 +08:00
|
|
|
if (error)
|
|
|
|
return error;
|
2021-03-24 10:05:39 +08:00
|
|
|
} else if (nagcount < oagcount) {
|
|
|
|
/* TODO: shrinking the entire AGs hasn't yet completed */
|
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-01-11 19:47:44 +08:00
|
|
|
|
2023-06-13 09:09:04 +08:00
|
|
|
if (delta > 0)
|
|
|
|
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
|
|
|
|
XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
|
|
|
|
&tp);
|
|
|
|
else
|
|
|
|
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0,
|
|
|
|
0, &tp);
|
2016-04-06 07:19:55 +08:00
|
|
|
if (error)
|
xfs: fix perag leak when growfs fails
During growfs, if new ag in memory has been initialized, however
sb_agcount has not been updated, if an error occurs at this time it
will cause perag leaks as follows, these new AGs will not been freed
during umount , because of these new AGs are not visible(that is
included in mp->m_sb.sb_agcount).
unreferenced object 0xffff88810be40200 (size 512):
comm "xfs_growfs", pid 857, jiffies 4294909093
hex dump (first 32 bytes):
00 c0 c1 05 81 88 ff ff 04 00 00 00 00 00 00 00 ................
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc 381741e2):
[<ffffffff8191aef6>] __kmalloc+0x386/0x4f0
[<ffffffff82553e65>] kmem_alloc+0xb5/0x2f0
[<ffffffff8238dac5>] xfs_initialize_perag+0xc5/0x810
[<ffffffff824f679c>] xfs_growfs_data+0x9bc/0xbc0
[<ffffffff8250b90e>] xfs_file_ioctl+0x5fe/0x14d0
[<ffffffff81aa5194>] __x64_sys_ioctl+0x144/0x1c0
[<ffffffff83c3d81f>] do_syscall_64+0x3f/0xe0
[<ffffffff83e00087>] entry_SYSCALL_64_after_hwframe+0x62/0x6a
unreferenced object 0xffff88810be40800 (size 512):
comm "xfs_growfs", pid 857, jiffies 4294909093
hex dump (first 32 bytes):
20 00 00 00 00 00 00 00 57 ef be dc 00 00 00 00 .......W.......
10 08 e4 0b 81 88 ff ff 10 08 e4 0b 81 88 ff ff ................
backtrace (crc bde50e2d):
[<ffffffff8191b43a>] __kmalloc_node+0x3da/0x540
[<ffffffff81814489>] kvmalloc_node+0x99/0x160
[<ffffffff8286acff>] bucket_table_alloc.isra.0+0x5f/0x400
[<ffffffff8286bdc5>] rhashtable_init+0x405/0x760
[<ffffffff8238dda3>] xfs_initialize_perag+0x3a3/0x810
[<ffffffff824f679c>] xfs_growfs_data+0x9bc/0xbc0
[<ffffffff8250b90e>] xfs_file_ioctl+0x5fe/0x14d0
[<ffffffff81aa5194>] __x64_sys_ioctl+0x144/0x1c0
[<ffffffff83c3d81f>] do_syscall_64+0x3f/0xe0
[<ffffffff83e00087>] entry_SYSCALL_64_after_hwframe+0x62/0x6a
Factor out xfs_free_unused_perag_range() from xfs_initialize_perag(),
used for freeing unused perag within a specified range in error handling,
included in the error path of the growfs failure.
Fixes: 1c1c6ebcf528 ("xfs: Replace per-ag array with a radix tree")
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2023-12-15 16:22:34 +08:00
|
|
|
goto out_free_unused_perag;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2022-07-07 17:07:09 +08:00
|
|
|
last_pag = xfs_perag_get(mp, oagcount - 1);
|
2021-03-24 10:05:39 +08:00
|
|
|
if (delta > 0) {
|
|
|
|
error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount,
|
2022-07-07 17:07:09 +08:00
|
|
|
delta, last_pag, &lastag_extended);
|
2021-03-24 10:05:39 +08:00
|
|
|
} else {
|
2022-05-27 08:31:34 +08:00
|
|
|
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK,
|
2021-03-24 10:05:39 +08:00
|
|
|
"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
|
|
|
|
|
2022-07-07 17:07:09 +08:00
|
|
|
error = xfs_ag_shrink_space(last_pag, &tp, -delta);
|
2021-03-24 10:05:39 +08:00
|
|
|
}
|
2022-07-07 17:07:09 +08:00
|
|
|
xfs_perag_put(last_pag);
|
2018-05-14 14:10:06 +08:00
|
|
|
if (error)
|
2018-05-14 14:10:07 +08:00
|
|
|
goto out_trans_cancel;
|
2018-05-14 14:10:06 +08:00
|
|
|
|
2010-01-11 19:47:44 +08:00
|
|
|
/*
|
|
|
|
* Update changed superblock fields transactionally. These are not
|
|
|
|
* seen by the rest of the world until the transaction commit applies
|
|
|
|
* them atomically to the superblock.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
if (nagcount > oagcount)
|
|
|
|
xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
|
2021-03-24 10:05:38 +08:00
|
|
|
if (delta)
|
|
|
|
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
|
2018-05-14 14:10:06 +08:00
|
|
|
if (id.nfree)
|
|
|
|
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
|
2021-03-24 10:05:37 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Sync sb counters now to reflect the updated values. This is
|
|
|
|
* particularly important for shrink because the write verifier
|
|
|
|
* will fail if sb_fdblocks is ever larger than sb_dblocks.
|
|
|
|
*/
|
2021-08-19 09:46:37 +08:00
|
|
|
if (xfs_has_lazysbcount(mp))
|
2021-03-24 10:05:37 +08:00
|
|
|
xfs_log_sb(tp);
|
|
|
|
|
2015-02-05 08:13:21 +08:00
|
|
|
xfs_trans_set_sync(tp);
|
2015-06-04 11:48:08 +08:00
|
|
|
error = xfs_trans_commit(tp);
|
2010-01-11 19:47:44 +08:00
|
|
|
if (error)
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
2010-01-11 19:47:44 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* New allocation groups fully initialized, so update mount struct */
|
|
|
|
if (nagimax)
|
|
|
|
mp->m_maxagi = nagimax;
|
2011-01-04 08:35:03 +08:00
|
|
|
xfs_set_low_space_thresholds(mp);
|
2016-08-03 09:38:24 +08:00
|
|
|
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
|
2010-01-11 19:47:44 +08:00
|
|
|
|
2021-03-24 10:05:39 +08:00
|
|
|
if (delta > 0) {
|
|
|
|
/*
|
|
|
|
* If we expanded the last AG, free the per-AG reservation
|
|
|
|
* so we can reinitialize it with the new size.
|
|
|
|
*/
|
|
|
|
if (lastag_extended) {
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
|
|
|
|
pag = xfs_perag_get(mp, id.agno);
|
|
|
|
error = xfs_ag_resv_free(pag);
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Reserve AG metadata blocks. ENOSPC here does not mean there
|
|
|
|
* was a growfs failure, just that there still isn't space for
|
|
|
|
* new user data after the grow has been run.
|
|
|
|
*/
|
|
|
|
error = xfs_fs_reserve_ag_blocks(mp);
|
|
|
|
if (error == -ENOSPC)
|
|
|
|
error = 0;
|
2017-01-04 10:39:33 +08:00
|
|
|
}
|
2018-05-14 14:10:07 +08:00
|
|
|
return error;
|
|
|
|
|
|
|
|
out_trans_cancel:
|
|
|
|
xfs_trans_cancel(tp);
|
xfs: fix perag leak when growfs fails
During growfs, if new ag in memory has been initialized, however
sb_agcount has not been updated, if an error occurs at this time it
will cause perag leaks as follows, these new AGs will not been freed
during umount , because of these new AGs are not visible(that is
included in mp->m_sb.sb_agcount).
unreferenced object 0xffff88810be40200 (size 512):
comm "xfs_growfs", pid 857, jiffies 4294909093
hex dump (first 32 bytes):
00 c0 c1 05 81 88 ff ff 04 00 00 00 00 00 00 00 ................
01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace (crc 381741e2):
[<ffffffff8191aef6>] __kmalloc+0x386/0x4f0
[<ffffffff82553e65>] kmem_alloc+0xb5/0x2f0
[<ffffffff8238dac5>] xfs_initialize_perag+0xc5/0x810
[<ffffffff824f679c>] xfs_growfs_data+0x9bc/0xbc0
[<ffffffff8250b90e>] xfs_file_ioctl+0x5fe/0x14d0
[<ffffffff81aa5194>] __x64_sys_ioctl+0x144/0x1c0
[<ffffffff83c3d81f>] do_syscall_64+0x3f/0xe0
[<ffffffff83e00087>] entry_SYSCALL_64_after_hwframe+0x62/0x6a
unreferenced object 0xffff88810be40800 (size 512):
comm "xfs_growfs", pid 857, jiffies 4294909093
hex dump (first 32 bytes):
20 00 00 00 00 00 00 00 57 ef be dc 00 00 00 00 .......W.......
10 08 e4 0b 81 88 ff ff 10 08 e4 0b 81 88 ff ff ................
backtrace (crc bde50e2d):
[<ffffffff8191b43a>] __kmalloc_node+0x3da/0x540
[<ffffffff81814489>] kvmalloc_node+0x99/0x160
[<ffffffff8286acff>] bucket_table_alloc.isra.0+0x5f/0x400
[<ffffffff8286bdc5>] rhashtable_init+0x405/0x760
[<ffffffff8238dda3>] xfs_initialize_perag+0x3a3/0x810
[<ffffffff824f679c>] xfs_growfs_data+0x9bc/0xbc0
[<ffffffff8250b90e>] xfs_file_ioctl+0x5fe/0x14d0
[<ffffffff81aa5194>] __x64_sys_ioctl+0x144/0x1c0
[<ffffffff83c3d81f>] do_syscall_64+0x3f/0xe0
[<ffffffff83e00087>] entry_SYSCALL_64_after_hwframe+0x62/0x6a
Factor out xfs_free_unused_perag_range() from xfs_initialize_perag(),
used for freeing unused perag within a specified range in error handling,
included in the error path of the growfs failure.
Fixes: 1c1c6ebcf528 ("xfs: Replace per-ag array with a radix tree")
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2023-12-15 16:22:34 +08:00
|
|
|
out_free_unused_perag:
|
|
|
|
if (nagcount > oagcount)
|
|
|
|
xfs_free_unused_perag_range(mp, oagcount, nagcount);
|
2018-05-14 14:10:07 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
xfs_growfs_log_private(
|
2021-02-03 10:24:06 +08:00
|
|
|
struct xfs_mount *mp, /* mount point for filesystem */
|
|
|
|
struct xfs_growfs_log *in) /* growfs log input struct */
|
2018-05-14 14:10:07 +08:00
|
|
|
{
|
|
|
|
xfs_extlen_t nb;
|
|
|
|
|
|
|
|
nb = in->newblocks;
|
|
|
|
if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
|
|
|
|
return -EINVAL;
|
|
|
|
if (nb == mp->m_sb.sb_logblocks &&
|
|
|
|
in->isint == (mp->m_sb.sb_logstart != 0))
|
|
|
|
return -EINVAL;
|
|
|
|
/*
|
|
|
|
* Moving the log is hard, need new interfaces to sync
|
|
|
|
* the log first, hold off all activity while moving it.
|
|
|
|
* Can have shorter or longer log in the same space,
|
|
|
|
* or transform internal to external log or vice versa.
|
|
|
|
*/
|
|
|
|
return -ENOSYS;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
xfs_growfs_imaxpct(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
__u32 imaxpct)
|
|
|
|
{
|
|
|
|
struct xfs_trans *tp;
|
|
|
|
int dpct;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (imaxpct > 100)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
|
|
|
|
XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
dpct = imaxpct - mp->m_sb.sb_imax_pct;
|
|
|
|
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
|
|
|
|
xfs_trans_set_sync(tp);
|
|
|
|
return xfs_trans_commit(tp);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* protected versions of growfs function acquire and release locks on the mount
|
|
|
|
* point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
|
|
|
|
* XFS_IOC_FSGROWFSRT
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_growfs_data(
|
2018-05-14 14:10:07 +08:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_growfs_data *in)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2018-05-14 14:10:07 +08:00
|
|
|
int error = 0;
|
2008-11-26 11:20:06 +08:00
|
|
|
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EPERM;
|
2007-08-30 15:21:54 +08:00
|
|
|
if (!mutex_trylock(&mp->m_growlock))
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EWOULDBLOCK;
|
2018-05-14 14:10:07 +08:00
|
|
|
|
|
|
|
/* update imaxpct separately to the physical grow of the filesystem */
|
|
|
|
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
|
|
|
|
error = xfs_growfs_imaxpct(mp, in->imaxpct);
|
|
|
|
if (error)
|
|
|
|
goto out_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (in->newblocks != mp->m_sb.sb_dblocks) {
|
|
|
|
error = xfs_growfs_data_private(mp, in);
|
|
|
|
if (error)
|
|
|
|
goto out_error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Post growfs calculations needed to reflect new state in operations */
|
|
|
|
if (mp->m_sb.sb_imax_pct) {
|
|
|
|
uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
|
|
|
|
do_div(icount, 100);
|
2019-06-06 02:19:34 +08:00
|
|
|
M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
|
2018-05-14 14:10:07 +08:00
|
|
|
} else
|
2019-06-06 02:19:34 +08:00
|
|
|
M_IGEO(mp)->maxicount = 0;
|
2018-05-14 14:10:07 +08:00
|
|
|
|
2018-05-14 14:10:07 +08:00
|
|
|
/* Update secondary superblocks now the physical grow has completed */
|
2018-05-14 14:10:08 +08:00
|
|
|
error = xfs_update_secondary_sbs(mp);
|
2018-05-14 14:10:07 +08:00
|
|
|
|
2018-05-14 14:10:07 +08:00
|
|
|
out_error:
|
2015-02-16 08:49:23 +08:00
|
|
|
/*
|
|
|
|
* Increment the generation unconditionally, the error could be from
|
|
|
|
* updating the secondary superblocks, in which case the new size
|
|
|
|
* is live already.
|
|
|
|
*/
|
|
|
|
mp->m_generation++;
|
2007-08-30 15:21:54 +08:00
|
|
|
mutex_unlock(&mp->m_growlock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
xfs_growfs_log(
|
|
|
|
xfs_mount_t *mp,
|
2021-02-03 10:24:06 +08:00
|
|
|
struct xfs_growfs_log *in)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int error;
|
2008-11-26 11:20:06 +08:00
|
|
|
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EPERM;
|
2007-08-30 15:21:54 +08:00
|
|
|
if (!mutex_trylock(&mp->m_growlock))
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EWOULDBLOCK;
|
2005-04-17 06:20:36 +08:00
|
|
|
error = xfs_growfs_log_private(mp, in);
|
2007-08-30 15:21:54 +08:00
|
|
|
mutex_unlock(&mp->m_growlock);
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve the requested number of blocks if available. Otherwise return
|
|
|
|
* as many as possible to satisfy the request. The actual number
|
2023-12-05 01:40:56 +08:00
|
|
|
* reserved are returned in outval.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_reserve_blocks(
|
2023-12-05 01:40:56 +08:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
uint64_t request)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2017-06-17 02:00:05 +08:00
|
|
|
int64_t lcounter, delta;
|
|
|
|
int64_t fdblks_delta = 0;
|
|
|
|
int64_t free;
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
int error = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-02-10 15:36:17 +08:00
|
|
|
/*
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
* With per-cpu counters, this becomes an interesting problem. We need
|
|
|
|
* to work out if we are freeing or allocating blocks first, then we can
|
|
|
|
* do the modification as necessary.
|
2007-02-10 15:36:17 +08:00
|
|
|
*
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
* We do this under the m_sb_lock so that if we are near ENOSPC, we will
|
|
|
|
* hold out any changes while we work out what to do. This means that
|
|
|
|
* the amount of free space can change while we do this, so we need to
|
|
|
|
* retry if we end up trying to reserve more space than is available.
|
2007-02-10 15:36:17 +08:00
|
|
|
*/
|
2007-10-11 15:42:32 +08:00
|
|
|
spin_lock(&mp->m_sb_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If our previous reservation was larger than the current value,
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
* then move any unused blocks back to the free pool. Modify the resblks
|
|
|
|
* counters directly since we shouldn't have any problems unreserving
|
|
|
|
* space.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
if (mp->m_resblks > request) {
|
|
|
|
lcounter = mp->m_resblks_avail - request;
|
|
|
|
if (lcounter > 0) { /* release unused blocks */
|
2007-02-10 15:36:17 +08:00
|
|
|
fdblks_delta = lcounter;
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_resblks_avail -= lcounter;
|
|
|
|
}
|
|
|
|
mp->m_resblks = request;
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
if (fdblks_delta) {
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
2006-09-07 12:26:50 +08:00
|
|
|
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
/*
|
|
|
|
* If the request is larger than the current reservation, reserve the
|
|
|
|
* blocks before we update the reserve counters. Sample m_fdblocks and
|
|
|
|
* perform a partial reservation if the request exceeds free space.
|
2022-03-12 02:56:01 +08:00
|
|
|
*
|
|
|
|
* The code below estimates how many blocks it can request from
|
|
|
|
* fdblocks to stash in the reserve pool. This is a classic TOCTOU
|
|
|
|
* race since fdblocks updates are not always coordinated via
|
2022-03-25 03:43:32 +08:00
|
|
|
* m_sb_lock. Set the reserve size even if there's not enough free
|
|
|
|
* space to fill it because mod_fdblocks will refill an undersized
|
|
|
|
* reserve when it can.
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
*/
|
2022-03-12 02:56:01 +08:00
|
|
|
free = percpu_counter_sum(&mp->m_fdblocks) -
|
xfs: don't include bnobt blocks when reserving free block pool
xfs_reserve_blocks controls the size of the user-visible free space
reserve pool. Given the difference between the current and requested
pool sizes, it will try to reserve free space from fdblocks. However,
the amount requested from fdblocks is also constrained by the amount of
space that we think xfs_mod_fdblocks will give us. If we forget to
subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will
return ENOSPC and we'll hang the kernel at mount due to the infinite
loop.
In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand
out the "free space" used by the free space btrees, because some portion
of the free space btrees hold in reserve space for future btree
expansion. Unfortunately, xfs_reserve_blocks' estimation of the number
of blocks that it could request from xfs_mod_fdblocks was not updated to
include m_allocbt_blks, so if space is extremely low, the caller hangs.
Fix this by creating a function to estimate the number of blocks that
can be reserved from fdblocks, which needs to exclude the set-aside and
m_allocbt_blks.
Found by running xfs/306 (which formats a single-AG 20MB filesystem)
with an fstests configuration that specifies a 1k blocksize and a
specially crafted log size that will consume 7/8 of the space (17920
blocks, specifically) in that AG.
Cc: Brian Foster <bfoster@redhat.com>
Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-03-17 02:54:18 +08:00
|
|
|
xfs_fdblocks_unavailable(mp);
|
2022-03-12 02:56:01 +08:00
|
|
|
delta = request - mp->m_resblks;
|
2022-03-25 03:43:32 +08:00
|
|
|
mp->m_resblks = request;
|
2022-03-12 02:56:01 +08:00
|
|
|
if (delta > 0 && free > 0) {
|
2007-02-10 15:36:17 +08:00
|
|
|
/*
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
* We'll either succeed in getting space from the free block
|
2022-03-12 02:56:01 +08:00
|
|
|
* count or we'll get an ENOSPC. Don't set the reserved flag
|
|
|
|
* here - we don't want to reserve the extra reserve blocks
|
|
|
|
* from the reserve.
|
2022-03-25 01:57:07 +08:00
|
|
|
*
|
|
|
|
* The desired reserve size can change after we drop the lock.
|
|
|
|
* Use mod_fdblocks to put the space into the reserve or into
|
|
|
|
* fdblocks as appropriate.
|
2007-02-10 15:36:17 +08:00
|
|
|
*/
|
2022-03-12 02:56:01 +08:00
|
|
|
fdblks_delta = min(free, delta);
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
|
2022-03-25 03:43:32 +08:00
|
|
|
if (!error)
|
2022-03-25 01:57:07 +08:00
|
|
|
xfs_mod_fdblocks(mp, fdblks_delta, 0);
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
2007-02-10 15:36:17 +08:00
|
|
|
}
|
xfs: refactor xfs_reserve_blocks() to handle ENOSPC correctly
xfs_reserve_blocks() is responsible to update the XFS reserved block
pool count at mount time or based on user request. When the caller
requests to increase the reserve pool, blocks must be allocated from
the global counters such that they are no longer available for
general purpose use. If the requested reserve pool size is too
large, XFS reserves what blocks are available. The implementation
requires looking at the percpu counters and making an educated guess
as to how many blocks to try and allocate from xfs_mod_fdblocks(),
which can return -ENOSPC if the guess was not accurate due to
counters being modified in parallel.
xfs_reserve_blocks() retries the guess in this scenario until the
allocation succeeds or it is determined that there is no space
available in the fs. While not easily reproducible in the current
form, the retry code doesn't actually work correctly if
xfs_mod_fdblocks() actually fails. The problem is that the percpu
calculations use the m_resblks counter to determine how many blocks
to allocate, but unconditionally update m_resblks before the block
allocation has actually succeeded. Therefore, if xfs_mod_fdblocks()
fails, the code jumps to the retry label and uses the already
updated m_resblks value to determine how many blocks to try and
allocate. If the percpu counters previously suggested that the
entire request was available, fdblocks_delta could end up set to 0.
In that case, m_resblks is updated to the requested value, yet no
blocks have been reserved at all.
Refactor xfs_reserve_blocks() to use an explicit loop and make the
code easier to follow. Since we have to drop the spinlock across the
xfs_mod_fdblocks() call, use a delta value for m_resblks as well and
only apply the delta once allocation succeeds.
[dchinner: convert to do {} while() loop]
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-06-21 09:53:28 +08:00
|
|
|
out:
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return error;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
xfs_fs_goingdown(
|
|
|
|
xfs_mount_t *mp,
|
2017-06-17 02:00:05 +08:00
|
|
|
uint32_t inflags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
switch (inflags) {
|
|
|
|
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
|
2023-10-24 21:01:08 +08:00
|
|
|
if (!bdev_freeze(mp->m_super->s_bdev)) {
|
2006-06-09 12:58:38 +08:00
|
|
|
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
|
2023-10-24 21:01:08 +08:00
|
|
|
bdev_thaw(mp->m_super->s_bdev);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
|
2006-06-09 12:58:38 +08:00
|
|
|
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
|
2006-06-09 12:58:38 +08:00
|
|
|
xfs_force_shutdown(mp,
|
|
|
|
SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
default:
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2012-04-23 13:59:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Force a shutdown of the filesystem instantly while keeping the filesystem
|
|
|
|
* consistent. We don't do an unmount here; just shutdown the shop, make sure
|
|
|
|
* that absolutely nothing persistent happens to this filesystem after this
|
|
|
|
* point.
|
2021-08-11 09:00:39 +08:00
|
|
|
*
|
|
|
|
* The shutdown state change is atomic, resulting in the first and only the
|
|
|
|
* first shutdown call processing the shutdown. This means we only shutdown the
|
|
|
|
* log once as it requires, and we don't spam the logs when multiple concurrent
|
|
|
|
* shutdowns race to set the shutdown flags.
|
2012-04-23 13:59:03 +08:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_do_force_shutdown(
|
2018-10-18 14:20:39 +08:00
|
|
|
struct xfs_mount *mp,
|
2022-04-21 08:47:38 +08:00
|
|
|
uint32_t flags,
|
2012-04-23 13:59:03 +08:00
|
|
|
char *fname,
|
|
|
|
int lnnum)
|
|
|
|
{
|
2021-08-11 09:00:39 +08:00
|
|
|
int tag;
|
|
|
|
const char *why;
|
2012-04-23 13:59:03 +08:00
|
|
|
|
2022-03-30 09:22:01 +08:00
|
|
|
|
|
|
|
if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) {
|
|
|
|
xlog_shutdown_wait(mp->m_log);
|
2018-10-18 14:20:39 +08:00
|
|
|
return;
|
2022-03-30 09:22:01 +08:00
|
|
|
}
|
2021-08-11 09:00:39 +08:00
|
|
|
if (mp->m_sb_bp)
|
|
|
|
mp->m_sb_bp->b_flags |= XBF_DONE;
|
|
|
|
|
|
|
|
if (flags & SHUTDOWN_FORCE_UMOUNT)
|
|
|
|
xfs_alert(mp, "User initiated shutdown received.");
|
2018-10-18 14:20:39 +08:00
|
|
|
|
2021-08-11 09:00:39 +08:00
|
|
|
if (xlog_force_shutdown(mp->m_log, flags)) {
|
|
|
|
tag = XFS_PTAG_SHUTDOWN_LOGERROR;
|
|
|
|
why = "Log I/O Error";
|
|
|
|
} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
|
|
|
|
tag = XFS_PTAG_SHUTDOWN_CORRUPT;
|
|
|
|
why = "Corruption of in-memory data";
|
2022-06-03 13:37:30 +08:00
|
|
|
} else if (flags & SHUTDOWN_CORRUPT_ONDISK) {
|
|
|
|
tag = XFS_PTAG_SHUTDOWN_CORRUPT;
|
|
|
|
why = "Corruption of on-disk metadata";
|
2023-06-01 17:44:55 +08:00
|
|
|
} else if (flags & SHUTDOWN_DEVICE_REMOVED) {
|
|
|
|
tag = XFS_PTAG_SHUTDOWN_IOERROR;
|
|
|
|
why = "Block device removal";
|
2020-05-07 04:29:19 +08:00
|
|
|
} else {
|
2021-08-11 09:00:39 +08:00
|
|
|
tag = XFS_PTAG_SHUTDOWN_IOERROR;
|
|
|
|
why = "Metadata I/O Error";
|
2012-04-23 13:59:03 +08:00
|
|
|
}
|
2018-10-18 14:20:39 +08:00
|
|
|
|
2021-08-11 08:00:54 +08:00
|
|
|
trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum);
|
|
|
|
|
2021-08-11 09:00:39 +08:00
|
|
|
xfs_alert_tag(mp, tag,
|
|
|
|
"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.",
|
|
|
|
why, flags, __return_address, fname, lnnum);
|
2018-10-18 14:20:39 +08:00
|
|
|
xfs_alert(mp,
|
|
|
|
"Please unmount the filesystem and rectify the problem(s)");
|
2021-08-11 09:00:39 +08:00
|
|
|
if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
|
|
|
|
xfs_stack_trace();
|
2012-04-23 13:59:03 +08:00
|
|
|
}
|
2016-10-04 00:11:44 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve free space for per-AG metadata.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_fs_reserve_ag_blocks(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
xfs_agnumber_t agno;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
int error = 0;
|
|
|
|
int err2;
|
|
|
|
|
2019-02-14 03:46:16 +08:00
|
|
|
mp->m_finobt_nores = false;
|
2021-06-02 08:48:24 +08:00
|
|
|
for_each_perag(mp, agno, pag) {
|
2018-07-30 13:37:08 +08:00
|
|
|
err2 = xfs_ag_resv_init(pag, NULL);
|
2016-10-04 00:11:44 +08:00
|
|
|
if (err2 && !error)
|
|
|
|
error = err2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error && error != -ENOSPC) {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"Error %d reserving per-AG metadata reserve pool.", error);
|
|
|
|
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
|
|
|
|
}
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free space reserved for per-AG metadata.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_fs_unreserve_ag_blocks(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
xfs_agnumber_t agno;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
int error = 0;
|
|
|
|
int err2;
|
|
|
|
|
2021-06-02 08:48:24 +08:00
|
|
|
for_each_perag(mp, agno, pag) {
|
2016-10-04 00:11:44 +08:00
|
|
|
err2 = xfs_ag_resv_free(pag);
|
|
|
|
if (err2 && !error)
|
|
|
|
error = err2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error)
|
|
|
|
xfs_warn(mp,
|
|
|
|
"Error %d freeing per-AG metadata reserve pool.", error);
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|