2018-06-06 10:42:14 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-11-02 11:58:39 +08:00
|
|
|
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_fs.h"
|
2013-10-23 07:36:05 +08:00
|
|
|
#include "xfs_shared.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_format.h"
|
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_bit.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_sb.h"
|
|
|
|
#include "xfs_mount.h"
|
|
|
|
#include "xfs_inode.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_dir2.h"
|
2005-11-02 11:38:42 +08:00
|
|
|
#include "xfs_ialloc.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_alloc.h"
|
|
|
|
#include "xfs_rtalloc.h"
|
|
|
|
#include "xfs_bmap.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_trans.h"
|
|
|
|
#include "xfs_trans_priv.h"
|
|
|
|
#include "xfs_log.h"
|
xfs: log shutdown triggers should only shut down the log
We've got a mess on our hands.
1. xfs_trans_commit() cannot cancel transactions because the mount is
shut down - that causes dirty, aborted, unlogged log items to sit
unpinned in memory and potentially get written to disk before the
log is shut down. Hence xfs_trans_commit() can only abort
transactions when xlog_is_shutdown() is true.
2. xfs_force_shutdown() is used in places to cause the current
modification to be aborted via xfs_trans_commit() because it may be
impractical or impossible to cancel the transaction directly, and
hence xfs_trans_commit() must cancel transactions when
xfs_is_shutdown() is true in this situation. But we can't do that
because of #1.
3. Log IO errors cause log shutdowns by calling xfs_force_shutdown()
to shut down the mount and then the log from log IO completion.
4. xfs_force_shutdown() can result in a log force being issued,
which has to wait for log IO completion before it will mark the log
as shut down. If #3 races with some other shutdown trigger that runs
a log force, we rely on xfs_force_shutdown() silently ignoring #3
and avoiding shutting down the log until the failed log force
completes.
5. To ensure #2 always works, we have to ensure that
xfs_force_shutdown() does not return until the log is shut down.
But in the case of #4, this will result in a deadlock because the
log IO completion will block waiting for a log force to complete
which is blocked waiting for log IO to complete....
So the very first thing we have to do here to untangle this mess is
dissociate log shutdown triggers from mount shutdowns. We already
have xlog_force_shutdown(), which will atomically transition the
log to a shutdown state. Due to internal asserts it cannot be called
multiple times, but was done simply because the only place that
could call it was xfs_do_force_shutdown() (i.e. the mount shutdown!)
and that could only call it once and once only. So the first thing
we do is remove the asserts.
We then convert all the internal log shutdown triggers to call
xlog_force_shutdown() directly instead of xfs_force_shutdown(). This
allows the log shutdown triggers to shut down the log without
needing to care about mount based shutdown constraints. This means
we shut down the log independently of the mount and the mount may
not notice this until its next attempt to read or modify metadata.
At that point (e.g. xfs_trans_commit()) it will see that the log is
shutdown, error out and shutdown the mount.
To ensure that all the unmount behaviours and asserts track
correctly as a result of a log shutdown, propagate the shutdown up
to the mount if it is not already set. This keeps the mount and log
state in sync, and saves a huge amount of hassle where code fails
because of a log shutdown but only checks for mount shutdowns and
hence ends up doing the wrong thing. Cleaning up that mess is
an exercise for another day.
This enables us to address the other problems noted above in
followup patches.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-03-30 09:22:01 +08:00
|
|
|
#include "xfs_log_priv.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
#include "xfs_error.h"
|
|
|
|
#include "xfs_quota.h"
|
|
|
|
#include "xfs_fsops.h"
|
2012-10-08 18:56:09 +08:00
|
|
|
#include "xfs_icache.h"
|
2014-07-15 06:07:01 +08:00
|
|
|
#include "xfs_sysfs.h"
|
2016-08-03 09:36:07 +08:00
|
|
|
#include "xfs_rmap_btree.h"
|
2016-10-04 00:11:18 +08:00
|
|
|
#include "xfs_refcount_btree.h"
|
2016-10-04 00:11:39 +08:00
|
|
|
#include "xfs_reflink.h"
|
2017-02-08 06:06:57 +08:00
|
|
|
#include "xfs_extent_busy.h"
|
2019-04-12 22:41:15 +08:00
|
|
|
#include "xfs_health.h"
|
2019-12-12 05:19:06 +08:00
|
|
|
#include "xfs_trace.h"
|
2021-06-02 08:48:24 +08:00
|
|
|
#include "xfs_ag.h"
|
2024-04-22 19:20:15 +08:00
|
|
|
#include "xfs_rtbitmap.h"
|
2023-08-10 22:48:07 +08:00
|
|
|
#include "scrub/stats.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-03-30 16:21:31 +08:00
|
|
|
static DEFINE_MUTEX(xfs_uuid_table_mutex);
|
|
|
|
static int xfs_uuid_table_size;
|
|
|
|
static uuid_t *xfs_uuid_table;
|
|
|
|
|
2015-11-03 10:06:34 +08:00
|
|
|
void
|
|
|
|
xfs_uuid_table_free(void)
|
|
|
|
{
|
|
|
|
if (xfs_uuid_table_size == 0)
|
|
|
|
return;
|
2024-01-16 06:59:43 +08:00
|
|
|
kfree(xfs_uuid_table);
|
2015-11-03 10:06:34 +08:00
|
|
|
xfs_uuid_table = NULL;
|
|
|
|
xfs_uuid_table_size = 0;
|
|
|
|
}
|
|
|
|
|
2009-03-30 16:21:31 +08:00
|
|
|
/*
|
|
|
|
* See if the UUID is unique among mounted XFS filesystems.
|
|
|
|
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_uuid_mount(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
uuid_t *uuid = &mp->m_sb.sb_uuid;
|
|
|
|
int hole, i;
|
|
|
|
|
2017-04-28 23:10:53 +08:00
|
|
|
/* Publish UUID in struct super_block */
|
2024-02-07 10:56:15 +08:00
|
|
|
super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
|
2017-04-28 23:10:53 +08:00
|
|
|
|
2021-08-19 09:46:52 +08:00
|
|
|
if (xfs_has_nouuid(mp))
|
2009-03-30 16:21:31 +08:00
|
|
|
return 0;
|
|
|
|
|
2017-05-04 21:26:23 +08:00
|
|
|
if (uuid_is_null(uuid)) {
|
|
|
|
xfs_warn(mp, "Filesystem has null UUID - can't mount");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2009-03-30 16:21:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&xfs_uuid_table_mutex);
|
|
|
|
for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
|
2017-05-04 21:26:23 +08:00
|
|
|
if (uuid_is_null(&xfs_uuid_table[i])) {
|
2009-03-30 16:21:31 +08:00
|
|
|
hole = i;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (uuid_equal(uuid, &xfs_uuid_table[i]))
|
|
|
|
goto out_duplicate;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (hole < 0) {
|
2020-08-27 05:05:56 +08:00
|
|
|
xfs_uuid_table = krealloc(xfs_uuid_table,
|
2009-03-30 16:21:31 +08:00
|
|
|
(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
|
2020-08-27 05:05:56 +08:00
|
|
|
GFP_KERNEL | __GFP_NOFAIL);
|
2009-03-30 16:21:31 +08:00
|
|
|
hole = xfs_uuid_table_size++;
|
|
|
|
}
|
|
|
|
xfs_uuid_table[hole] = *uuid;
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_duplicate:
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
2012-01-13 13:58:39 +08:00
|
|
|
xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EINVAL;
|
2009-03-30 16:21:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
STATIC void
|
|
|
|
xfs_uuid_unmount(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
uuid_t *uuid = &mp->m_sb.sb_uuid;
|
|
|
|
int i;
|
|
|
|
|
2021-08-19 09:46:52 +08:00
|
|
|
if (xfs_has_nouuid(mp))
|
2009-03-30 16:21:31 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
mutex_lock(&xfs_uuid_table_mutex);
|
|
|
|
for (i = 0; i < xfs_uuid_table_size; i++) {
|
2017-05-04 21:26:23 +08:00
|
|
|
if (uuid_is_null(&xfs_uuid_table[i]))
|
2009-03-30 16:21:31 +08:00
|
|
|
continue;
|
|
|
|
if (!uuid_equal(uuid, &xfs_uuid_table[i]))
|
|
|
|
continue;
|
|
|
|
memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ASSERT(i < xfs_uuid_table_size);
|
|
|
|
mutex_unlock(&xfs_uuid_table_mutex);
|
|
|
|
}
|
|
|
|
|
2007-05-14 16:24:02 +08:00
|
|
|
/*
|
|
|
|
* Check size of device based on the (data/realtime) block count.
|
|
|
|
* Note: this check is used by the growfs code as well as mount.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_sb_validate_fsb_count(
|
|
|
|
xfs_sb_t *sbp,
|
2017-06-17 02:00:05 +08:00
|
|
|
uint64_t nblocks)
|
2007-05-14 16:24:02 +08:00
|
|
|
{
|
|
|
|
ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
|
|
|
|
ASSERT(sbp->sb_blocklog >= BBSHIFT);
|
|
|
|
|
2014-07-30 07:12:05 +08:00
|
|
|
/* Limited by ULONG_MAX of page cache index */
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EFBIG;
|
2007-05-14 16:24:02 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* xfs_readsb
|
|
|
|
*
|
|
|
|
* Does the initial read of the superblock.
|
|
|
|
*/
|
|
|
|
int
|
2013-08-12 18:49:41 +08:00
|
|
|
xfs_readsb(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int flags)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
unsigned int sector_size;
|
2013-04-03 13:11:31 +08:00
|
|
|
struct xfs_buf *bp;
|
|
|
|
struct xfs_sb *sbp = &mp->m_sb;
|
2005-04-17 06:20:36 +08:00
|
|
|
int error;
|
2011-03-07 07:04:35 +08:00
|
|
|
int loud = !(flags & XFS_MFSI_QUIET);
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
const struct xfs_buf_ops *buf_ops;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
ASSERT(mp->m_sb_bp == NULL);
|
|
|
|
ASSERT(mp->m_ddev_targp != NULL);
|
|
|
|
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
/*
|
|
|
|
* For the initial read, we must guess at the sector
|
|
|
|
* size based on the block device. It's enough to
|
|
|
|
* get the sb_sectsize out of the superblock and
|
|
|
|
* then reread with the proper length.
|
|
|
|
* We don't verify it yet, because it may not be complete.
|
|
|
|
*/
|
|
|
|
sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
|
|
|
|
buf_ops = NULL;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2016-07-20 09:13:43 +08:00
|
|
|
* Allocate a (locked) buffer to hold the superblock. This will be kept
|
|
|
|
* around at all times to optimize access to the superblock. Therefore,
|
|
|
|
* set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
|
|
|
|
* elevated.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2010-09-22 08:47:20 +08:00
|
|
|
reread:
|
2014-10-02 07:05:32 +08:00
|
|
|
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
|
2016-07-20 09:13:43 +08:00
|
|
|
BTOBB(sector_size), XBF_NO_IOACCT, &bp,
|
|
|
|
buf_ops);
|
2014-10-02 07:05:32 +08:00
|
|
|
if (error) {
|
2012-11-12 19:54:02 +08:00
|
|
|
if (loud)
|
2013-04-03 13:11:32 +08:00
|
|
|
xfs_warn(mp, "SB validate failed with error %d.", error);
|
2014-03-07 13:19:14 +08:00
|
|
|
/* bad CRC means corrupted metadata */
|
2014-06-25 12:58:08 +08:00
|
|
|
if (error == -EFSBADCRC)
|
|
|
|
error = -EFSCORRUPTED;
|
2014-10-02 07:05:32 +08:00
|
|
|
return error;
|
2012-11-12 19:54:02 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the mount structure from the superblock.
|
|
|
|
*/
|
2020-03-10 23:57:30 +08:00
|
|
|
xfs_sb_from_disk(sbp, bp->b_addr);
|
2014-06-06 14:00:43 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we haven't validated the superblock, do so now before we try
|
|
|
|
* to check the sector size and reread the superblock appropriately.
|
|
|
|
*/
|
|
|
|
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
|
|
|
|
if (loud)
|
|
|
|
xfs_warn(mp, "Invalid superblock magic number");
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EINVAL;
|
2014-06-06 14:00:43 +08:00
|
|
|
goto release_buf;
|
|
|
|
}
|
2013-08-12 18:49:41 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We must be able to do sector-sized and sector-aligned IO.
|
|
|
|
*/
|
2013-04-03 13:11:31 +08:00
|
|
|
if (sector_size > sbp->sb_sectsize) {
|
2011-03-07 07:04:35 +08:00
|
|
|
if (loud)
|
|
|
|
xfs_warn(mp, "device supports %u byte sectors (not %u)",
|
2013-04-03 13:11:31 +08:00
|
|
|
sector_size, sbp->sb_sectsize);
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -ENOSYS;
|
2010-09-22 08:47:20 +08:00
|
|
|
goto release_buf;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
if (buf_ops == NULL) {
|
2014-06-06 14:00:43 +08:00
|
|
|
/*
|
|
|
|
* Re-read the superblock so the buffer is correctly sized,
|
|
|
|
* and properly verified.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_buf_relse(bp);
|
2013-04-03 13:11:31 +08:00
|
|
|
sector_size = sbp->sb_sectsize;
|
xfs: skip verification on initial "guess" superblock read
When xfs_readsb() does the very first read of the superblock,
it makes a guess at the length of the buffer, based on the
sector size of the underlying storage. This may or may
not match the filesystem sector size in sb_sectsize, so
we can't i.e. do a CRC check on it; it might be too short.
In fact, mounting a filesystem with sb_sectsize larger
than the device sector size will cause a mount failure
if CRCs are enabled, because we are checksumming a length
which exceeds the buffer passed to it.
So always read twice; the first time we read with NULL
buffer ops to skip verification; then set the proper
read length, hook up the proper verifier, and give it
another go.
Once we are sure that we've got the right buffer length,
we can also use bp->b_length in the xfs_sb_read_verify,
rather than the less-trusted on-disk sectorsize for
secondary superblocks. Before this we ran the risk of
passing junk to the crc32c routines, which didn't always
handle extreme values.
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-02-19 12:39:16 +08:00
|
|
|
buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
|
2010-09-22 08:47:20 +08:00
|
|
|
goto reread;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2021-08-19 09:46:26 +08:00
|
|
|
mp->m_features |= xfs_sb_version_to_features(sbp);
|
2015-02-23 18:22:31 +08:00
|
|
|
xfs_reinit_percpu_counters(mp);
|
2006-03-14 10:13:09 +08:00
|
|
|
|
2024-04-23 00:47:25 +08:00
|
|
|
/*
|
|
|
|
* If logged xattrs are enabled after log recovery finishes, then set
|
|
|
|
* the opstate so that log recovery will work properly.
|
|
|
|
*/
|
|
|
|
if (xfs_sb_version_haslogxattrs(&mp->m_sb))
|
|
|
|
xfs_set_using_logged_xattrs(mp);
|
|
|
|
|
2013-04-03 13:11:31 +08:00
|
|
|
/* no need to be quiet anymore, so reset the buf ops */
|
|
|
|
bp->b_ops = &xfs_sb_buf_ops;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_sb_bp = bp;
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_unlock(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
|
2010-09-22 08:47:20 +08:00
|
|
|
release_buf:
|
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2019-12-12 05:19:06 +08:00
|
|
|
/*
|
|
|
|
* If the sunit/swidth change would move the precomputed root inode value, we
|
|
|
|
* must reject the ondisk change because repair will stumble over that.
|
|
|
|
* However, we allow the mount to proceed because we never rejected this
|
|
|
|
* combination before. Returns true to update the sb, false otherwise.
|
|
|
|
*/
|
|
|
|
static inline int
|
|
|
|
xfs_check_new_dalign(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int new_dalign,
|
|
|
|
bool *update_sb)
|
|
|
|
{
|
|
|
|
struct xfs_sb *sbp = &mp->m_sb;
|
|
|
|
xfs_ino_t calc_ino;
|
|
|
|
|
|
|
|
calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
|
|
|
|
trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
|
|
|
|
|
|
|
|
if (sbp->sb_rootino == calc_ino) {
|
|
|
|
*update_sb = true;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
xfs_warn(mp,
|
|
|
|
"Cannot change stripe alignment; would require moving root inode.");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX: Next time we add a new incompat feature, this should start
|
|
|
|
* returning -EINVAL to fail the mount. Until then, spit out a warning
|
|
|
|
* that we're ignoring the administrator's instructions.
|
|
|
|
*/
|
|
|
|
xfs_warn(mp, "Skipping superblock stripe alignment update.");
|
|
|
|
*update_sb = false;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2019-12-19 03:13:16 +08:00
|
|
|
* If we were provided with new sunit/swidth values as mount options, make sure
|
|
|
|
* that they pass basic alignment and superblock feature checks, and convert
|
|
|
|
* them into the same units (FSB) that everything else expects. This step
|
|
|
|
* /must/ be done before computing the inode geometry.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
STATIC int
|
2019-12-19 03:13:16 +08:00
|
|
|
xfs_validate_new_dalign(
|
|
|
|
struct xfs_mount *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2019-12-19 03:13:16 +08:00
|
|
|
if (mp->m_dalign == 0)
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-12-19 03:13:16 +08:00
|
|
|
/*
|
|
|
|
* If stripe unit and stripe width are not multiples
|
|
|
|
* of the fs blocksize turn off alignment.
|
|
|
|
*/
|
|
|
|
if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
|
|
|
|
(BBTOB(mp->m_swidth) & mp->m_blockmask)) {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"alignment check failed: sunit/swidth vs. blocksize(%d)",
|
|
|
|
mp->m_sb.sb_blocksize);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2022-09-19 04:49:14 +08:00
|
|
|
/*
|
|
|
|
* Convert the stripe unit and width to FSBs.
|
|
|
|
*/
|
|
|
|
mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
|
|
|
|
if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"alignment check failed: sunit/swidth vs. agsize(%d)",
|
|
|
|
mp->m_sb.sb_agblocks);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!mp->m_dalign) {
|
|
|
|
xfs_warn(mp,
|
|
|
|
"alignment check failed: sunit(%d) less than bsize(%d)",
|
|
|
|
mp->m_dalign, mp->m_sb.sb_blocksize);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
|
|
|
|
|
2021-08-19 09:46:37 +08:00
|
|
|
if (!xfs_has_dalign(mp)) {
|
2019-12-19 03:13:16 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"cannot change alignment: superblock does not support data alignment");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Update alignment values based on mount options and sb values. */
|
|
|
|
STATIC int
|
|
|
|
xfs_update_alignment(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
struct xfs_sb *sbp = &mp->m_sb;
|
|
|
|
|
|
|
|
if (mp->m_dalign) {
|
2019-12-12 05:19:06 +08:00
|
|
|
bool update_sb;
|
|
|
|
int error;
|
|
|
|
|
2019-12-19 03:13:16 +08:00
|
|
|
if (sbp->sb_unit == mp->m_dalign &&
|
|
|
|
sbp->sb_width == mp->m_swidth)
|
|
|
|
return 0;
|
|
|
|
|
2019-12-12 05:19:06 +08:00
|
|
|
error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
|
|
|
|
if (error || !update_sb)
|
|
|
|
return error;
|
|
|
|
|
2019-12-19 03:13:16 +08:00
|
|
|
sbp->sb_unit = mp->m_dalign;
|
|
|
|
sbp->sb_width = mp->m_swidth;
|
|
|
|
mp->m_update_sb = true;
|
2021-08-19 09:46:52 +08:00
|
|
|
} else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
|
2019-12-19 03:13:16 +08:00
|
|
|
mp->m_dalign = sbp->sb_unit;
|
|
|
|
mp->m_swidth = sbp->sb_width;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-04 08:35:03 +08:00
|
|
|
/*
|
|
|
|
* precalculate the low space thresholds for dynamic speculative preallocation.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_set_low_space_thresholds(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2021-08-07 02:05:41 +08:00
|
|
|
uint64_t dblocks = mp->m_sb.sb_dblocks;
|
|
|
|
uint64_t rtexts = mp->m_sb.sb_rextents;
|
|
|
|
int i;
|
2011-01-04 08:35:03 +08:00
|
|
|
|
2021-08-07 02:05:41 +08:00
|
|
|
do_div(dblocks, 100);
|
|
|
|
do_div(rtexts, 100);
|
2011-01-04 08:35:03 +08:00
|
|
|
|
2021-08-07 02:05:41 +08:00
|
|
|
for (i = 0; i < XFS_LOWSP_MAX; i++) {
|
|
|
|
mp->m_low_space[i] = dblocks * (i + 1);
|
|
|
|
mp->m_low_rtexts[i] = rtexts * (i + 1);
|
2011-01-04 08:35:03 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
2013-08-07 18:10:58 +08:00
|
|
|
* Check that the data (and log if separate) is an ok size.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
|
|
|
STATIC int
|
2014-10-02 07:05:32 +08:00
|
|
|
xfs_check_sizes(
|
|
|
|
struct xfs_mount *mp)
|
2007-10-12 09:03:40 +08:00
|
|
|
{
|
2014-10-02 07:05:32 +08:00
|
|
|
struct xfs_buf *bp;
|
2007-10-12 09:03:40 +08:00
|
|
|
xfs_daddr_t d;
|
2014-10-02 07:05:32 +08:00
|
|
|
int error;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
|
|
|
|
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "filesystem size mismatch detected");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EFBIG;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2014-10-02 07:05:32 +08:00
|
|
|
error = xfs_buf_read_uncached(mp->m_ddev_targp,
|
2010-09-22 08:47:20 +08:00
|
|
|
d - XFS_FSS_TO_BB(mp, 1),
|
2014-10-02 07:05:32 +08:00
|
|
|
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "last sector read failed");
|
2014-10-02 07:05:32 +08:00
|
|
|
return error;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-10-02 07:05:32 +08:00
|
|
|
if (mp->m_logdev_targp == mp->m_ddev_targp)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
|
|
|
|
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
|
|
|
|
xfs_warn(mp, "log size mismatch detected");
|
|
|
|
return -EFBIG;
|
|
|
|
}
|
|
|
|
error = xfs_buf_read_uncached(mp->m_logdev_targp,
|
2010-09-22 08:47:20 +08:00
|
|
|
d - XFS_FSB_TO_BB(mp, 1),
|
2014-10-02 07:05:32 +08:00
|
|
|
XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
|
|
|
|
if (error) {
|
|
|
|
xfs_warn(mp, "log device read failed");
|
|
|
|
return error;
|
2007-10-12 09:03:40 +08:00
|
|
|
}
|
2014-10-02 07:05:32 +08:00
|
|
|
xfs_buf_relse(bp);
|
2007-10-12 09:03:40 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-06-08 21:33:32 +08:00
|
|
|
/*
|
|
|
|
* Clear the quotaflags in memory and in the superblock.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_mount_reset_sbqflags(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
mp->m_qflags = 0;
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
/* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
|
2009-06-08 21:33:32 +08:00
|
|
|
if (mp->m_sb.sb_qflags == 0)
|
|
|
|
return 0;
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
mp->m_sb.sb_qflags = 0;
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
|
2009-06-08 21:33:32 +08:00
|
|
|
return 0;
|
|
|
|
|
2015-01-22 06:10:31 +08:00
|
|
|
return xfs_sync_sb(mp, false);
|
2009-06-08 21:33:32 +08:00
|
|
|
}
|
|
|
|
|
2017-06-17 02:00:05 +08:00
|
|
|
uint64_t
|
2010-02-06 06:59:53 +08:00
|
|
|
xfs_default_resblks(xfs_mount_t *mp)
|
|
|
|
{
|
2017-06-17 02:00:05 +08:00
|
|
|
uint64_t resblks;
|
2010-02-06 06:59:53 +08:00
|
|
|
|
|
|
|
/*
|
2010-03-04 09:46:25 +08:00
|
|
|
* We default to 5% or 8192 fsbs of space reserved, whichever is
|
|
|
|
* smaller. This is intended to cover concurrent allocation
|
|
|
|
* transactions when we initially hit enospc. These each require a 4
|
|
|
|
* block reservation. Hence by default we cover roughly 2000 concurrent
|
|
|
|
* allocation reservations.
|
2010-02-06 06:59:53 +08:00
|
|
|
*/
|
|
|
|
resblks = mp->m_sb.sb_dblocks;
|
|
|
|
do_div(resblks, 20);
|
2017-06-17 02:00:05 +08:00
|
|
|
resblks = min_t(uint64_t, resblks, 8192);
|
2010-02-06 06:59:53 +08:00
|
|
|
return resblks;
|
|
|
|
}
|
|
|
|
|
2018-07-20 03:29:13 +08:00
|
|
|
/* Ensure the summary counts are correct. */
|
|
|
|
STATIC int
|
|
|
|
xfs_check_summary_counts(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2022-04-12 04:49:42 +08:00
|
|
|
int error = 0;
|
|
|
|
|
2018-07-20 03:29:13 +08:00
|
|
|
/*
|
|
|
|
* The AG0 superblock verifier rejects in-progress filesystems,
|
|
|
|
* so we should never see the flag set this far into mounting.
|
|
|
|
*/
|
|
|
|
if (mp->m_sb.sb_inprogress) {
|
|
|
|
xfs_err(mp, "sb_inprogress set after log recovery??");
|
|
|
|
WARN_ON(1);
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now the log is mounted, we know if it was an unclean shutdown or
|
|
|
|
* not. If it was, with the first phase of recovery has completed, we
|
|
|
|
* have consistent AG blocks on disk. We have not recovered EFIs yet,
|
|
|
|
* but they are recovered transactionally in the second recovery phase
|
|
|
|
* later.
|
|
|
|
*
|
|
|
|
* If the log was clean when we mounted, we can check the summary
|
|
|
|
* counters. If any of them are obviously incorrect, we can recompute
|
|
|
|
* them from the AGF headers in the next step.
|
|
|
|
*/
|
2021-08-19 09:46:52 +08:00
|
|
|
if (xfs_is_clean(mp) &&
|
2018-07-20 03:29:13 +08:00
|
|
|
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
|
2018-08-11 08:55:56 +08:00
|
|
|
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
|
2018-07-20 03:29:13 +08:00
|
|
|
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
|
2019-04-12 22:41:15 +08:00
|
|
|
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
|
2018-07-20 03:29:13 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We can safely re-initialise incore superblock counters from the
|
|
|
|
* per-ag data. These may not be correct if the filesystem was not
|
|
|
|
* cleanly unmounted, so we waited for recovery to finish before doing
|
|
|
|
* this.
|
|
|
|
*
|
|
|
|
* If the filesystem was cleanly unmounted or the previous check did
|
|
|
|
* not flag anything weird, then we can trust the values in the
|
|
|
|
* superblock to be correct and we don't need to do anything here.
|
|
|
|
* Otherwise, recalculate the summary counters.
|
|
|
|
*/
|
2022-04-12 04:49:42 +08:00
|
|
|
if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
|
|
|
|
xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
|
|
|
|
error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Older kernels misused sb_frextents to reflect both incore
|
|
|
|
* reservations made by running transactions and the actual count of
|
|
|
|
* free rt extents in the ondisk metadata. Transactions committed
|
|
|
|
* during runtime can therefore contain a superblock update that
|
|
|
|
* undercounts the number of free rt extents tracked in the rt bitmap.
|
|
|
|
* A clean unmount record will have the correct frextents value since
|
|
|
|
* there can be no other transactions running at that point.
|
|
|
|
*
|
|
|
|
* If we're mounting the rt volume after recovering the log, recompute
|
|
|
|
* frextents from the rtbitmap file to fix the inconsistency.
|
|
|
|
*/
|
|
|
|
if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
|
|
|
|
error = xfs_rtalloc_reinit_frextents(mp);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
2018-07-20 03:29:13 +08:00
|
|
|
|
2022-04-12 04:49:42 +08:00
|
|
|
return 0;
|
2018-07-20 03:29:13 +08:00
|
|
|
}
|
|
|
|
|
xfs: fix sb write verify for lazysbcount
When lazysbcount is enabled, fsstress and loop mount/unmount test report
the following problems:
XFS (loop0): SB summary counter sanity check failed
XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460,
xfs_sb block 0x0
XFS (loop0): Unmount and run xfs_repair
XFS (loop0): First 128 bytes of corrupted metadata buffer:
00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(..
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z
00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... ..........
00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................
00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................
00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................
00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................
XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply
+0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem.
XFS (loop0): Please unmount the filesystem and rectify the problem(s)
XFS (loop0): log mount/recovery failed: error -117
XFS (loop0): log mount failed
This corruption will shutdown the file system and the file system will
no longer be mountable. The following script can reproduce the problem,
but it may take a long time.
#!/bin/bash
device=/dev/sda
testdir=/mnt/test
round=0
function fail()
{
echo "$*"
exit 1
}
mkdir -p $testdir
while [ $round -lt 10000 ]
do
echo "******* round $round ********"
mkfs.xfs -f $device
mount $device $testdir || fail "mount failed!"
fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null &
sleep 4
killall -w fsstress
umount $testdir
xfs_repair -e $device > /dev/null
if [ $? -eq 2 ];then
echo "ERR CODE 2: Dirty log exception during repair."
exit 1
fi
round=$(($round+1))
done
With lazysbcount enabled, there is no additional lock protection for
reading m_ifree and m_icount in xfs_log_sb(), if other cpu modifies the
m_ifree, this will make the m_ifree greater than m_icount. For example,
consider the following sequence, where ifreedelta is positive:
CPU0 CPU1
xfs_log_sb xfs_trans_unreserve_and_mod_sb
---------- ------------------------------
percpu_counter_sum(&mp->m_icount)
percpu_counter_add_batch(&mp->m_icount,
idelta, XFS_ICOUNT_BATCH)
percpu_counter_add(&mp->m_ifree, ifreedelta);
percpu_counter_sum(&mp->m_ifree)
After this, an incorrect inode count (sb_ifree > sb_icount) will be written to
the log. On the subsequent write of the sb, the incorrect inode count (sb_ifree >
sb_icount) will fail the boundary check in xfs_validate_sb_write(),
which causes the file system to shut down.
When lazysbcount is enabled, we don't need to guarantee that Lazy sb
counters are completely correct, but we do need to guarantee that sb_ifree
<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount
must be satisfied any time that there /cannot/ be other threads allocating
or freeing inode chunks. If the constraint is violated under these
circumstances, sb_i{count,free} (the ondisk superblock inode counters)
may be incorrect and need to be marked sick at unmount; the counts will
be rebuilt on the next mount.
Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks")
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-11-17 11:20:20 +08:00
|
|
|
static void
|
|
|
|
xfs_unmount_check(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
if (xfs_is_shutdown(mp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (percpu_counter_sum(&mp->m_ifree) >
|
|
|
|
percpu_counter_sum(&mp->m_icount)) {
|
|
|
|
xfs_alert(mp, "ifree/icount mismatch at unmount");
|
|
|
|
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-03 01:32:53 +08:00
|
|
|
/*
|
|
|
|
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
|
|
|
|
* internal inode structures can be sitting in the CIL and AIL at this point,
|
|
|
|
* so we need to unpin them, write them back and/or reclaim them before unmount
|
2021-08-07 02:05:39 +08:00
|
|
|
* can proceed. In other words, callers are required to have inactivated all
|
|
|
|
* inodes.
|
2021-03-03 01:32:53 +08:00
|
|
|
*
|
|
|
|
* An inode cluster that has been freed can have its buffer still pinned in
|
|
|
|
* memory because the transaction is still sitting in an iclog. The stale inodes
|
|
|
|
* on that buffer will be pinned to the buffer until the transaction hits the
|
|
|
|
* disk and the callbacks run. Pushing the AIL will skip the stale inodes and
|
|
|
|
* may never see the pinned buffer, so nothing will push out the iclog and
|
|
|
|
* unpin the buffer.
|
|
|
|
*
|
|
|
|
* Hence we need to force the log to unpin everything first. However, log
|
|
|
|
* forces don't wait for the discards they issue to complete, so we have to
|
|
|
|
* explicitly wait for them to complete here as well.
|
|
|
|
*
|
|
|
|
* Then we can tell the world we are unmounting so that error handling knows
|
|
|
|
* that the filesystem is going away and we should error out anything that we
|
|
|
|
* have been retrying in the background. This will prevent never-ending
|
|
|
|
* retries in AIL pushing from hanging the unmount.
|
|
|
|
*
|
|
|
|
* Finally, we can push the AIL to clean all the remaining dirty objects, then
|
|
|
|
* reclaim the remaining inodes that are still in memory at this point in time.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xfs_unmount_flush_inodes(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
xfs_log_force(mp, XFS_LOG_SYNC);
|
|
|
|
xfs_extent_busy_wait_all(mp);
|
|
|
|
flush_workqueue(xfs_discard_wq);
|
|
|
|
|
2021-08-19 09:46:52 +08:00
|
|
|
set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate);
|
2021-03-03 01:32:53 +08:00
|
|
|
|
|
|
|
xfs_ail_push_all_sync(mp->m_ail);
|
2021-08-07 02:05:39 +08:00
|
|
|
xfs_inodegc_stop(mp);
|
2021-03-03 01:32:53 +08:00
|
|
|
cancel_delayed_work_sync(&mp->m_reclaim_work);
|
|
|
|
xfs_reclaim_inodes(mp);
|
|
|
|
xfs_health_unmount(mp);
|
|
|
|
}
|
|
|
|
|
2021-04-06 22:03:24 +08:00
|
|
|
static void
|
|
|
|
xfs_mount_setup_inode_geom(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
struct xfs_ino_geometry *igeo = M_IGEO(mp);
|
|
|
|
|
|
|
|
igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
|
|
|
|
ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));
|
|
|
|
|
|
|
|
xfs_ialloc_setup_geometry(mp);
|
|
|
|
}
|
|
|
|
|
2021-09-17 03:27:34 +08:00
|
|
|
/* Compute maximum possible height for per-AG btree types for this fs. */
|
|
|
|
static inline void
|
|
|
|
xfs_agbtree_compute_maxlevels(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
unsigned int levels;
|
|
|
|
|
|
|
|
levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
|
|
|
|
levels = max(levels, mp->m_rmap_maxlevels);
|
|
|
|
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
|
|
|
* This function does the following on an initial mount of a file system:
|
|
|
|
* - reads the superblock from disk and init the mount struct
|
|
|
|
* - if we're a 32-bit kernel, do a size check on the superblock
|
|
|
|
* so we don't mount terabyte filesystems
|
|
|
|
* - init mount struct realtime fields
|
|
|
|
* - allocate inode hash table for fs
|
|
|
|
* - init directory manager
|
|
|
|
* - perform recovery and init the log manager
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_mountfs(
|
2015-08-19 07:58:36 +08:00
|
|
|
struct xfs_mount *mp)
|
2007-10-12 09:03:40 +08:00
|
|
|
{
|
2015-08-19 07:58:36 +08:00
|
|
|
struct xfs_sb *sbp = &(mp->m_sb);
|
|
|
|
struct xfs_inode *rip;
|
2019-06-06 02:19:34 +08:00
|
|
|
struct xfs_ino_geometry *igeo = M_IGEO(mp);
|
2015-08-19 07:58:36 +08:00
|
|
|
uint quotamount = 0;
|
|
|
|
uint quotaflags = 0;
|
|
|
|
int error = 0;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2013-08-12 18:49:41 +08:00
|
|
|
xfs_sb_mount_common(mp, sbp);
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2008-03-06 10:45:50 +08:00
|
|
|
/*
|
2015-01-22 06:10:33 +08:00
|
|
|
* Check for mismatched features2 values. Older kernels read & wrote
|
|
|
|
* into the wrong sb offset for sb_features2 on some platforms due to
|
|
|
|
* xfs_sb_t not being 64bit size aligned when sb_features2 was added,
|
|
|
|
* which made older superblock reading/writing routines swap it as a
|
|
|
|
* 64-bit value.
|
2008-03-06 10:45:50 +08:00
|
|
|
*
|
2008-04-10 10:19:34 +08:00
|
|
|
* For backwards compatibility, we make both slots equal.
|
|
|
|
*
|
2015-01-22 06:10:33 +08:00
|
|
|
* If we detect a mismatched field, we OR the set bits into the existing
|
|
|
|
* features2 field in case it has already been modified; we don't want
|
|
|
|
* to lose any features. We then update the bad location with the ORed
|
|
|
|
* value so that older kernels will see any features2 flags. The
|
|
|
|
* superblock writeback code ensures the new sb_features2 is copied to
|
|
|
|
* sb_bad_features2 before it is logged or written to disk.
|
2008-03-06 10:45:50 +08:00
|
|
|
*/
|
2008-04-10 10:19:34 +08:00
|
|
|
if (xfs_sb_has_mismatched_features2(sbp)) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "correcting sb_features alignment problem");
|
2008-03-06 10:45:50 +08:00
|
|
|
sbp->sb_features2 |= sbp->sb_bad_features2;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2008-04-30 16:15:28 +08:00
|
|
|
}
|
|
|
|
|
2008-03-06 10:45:50 +08:00
|
|
|
|
2014-05-20 05:46:40 +08:00
|
|
|
/* always use v2 inodes by default now */
|
|
|
|
if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
|
|
|
|
mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
|
2021-08-19 09:46:37 +08:00
|
|
|
mp->m_features |= XFS_FEAT_NLINK;
|
2015-01-22 06:10:31 +08:00
|
|
|
mp->m_update_sb = true;
|
2014-05-20 05:46:40 +08:00
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
2019-12-19 03:13:16 +08:00
|
|
|
* If we were given new sunit/swidth options, do some basic validation
|
|
|
|
* checks and convert the incore dalign and swidth values to the
|
|
|
|
* same units (FSB) that everything else uses. This /must/ happen
|
|
|
|
* before computing the inode geometry.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
2019-12-19 03:13:16 +08:00
|
|
|
error = xfs_validate_new_dalign(mp);
|
2007-10-12 09:03:40 +08:00
|
|
|
if (error)
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
|
|
|
xfs_alloc_compute_maxlevels(mp);
|
|
|
|
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
|
|
|
|
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
|
2021-04-06 22:03:24 +08:00
|
|
|
xfs_mount_setup_inode_geom(mp);
|
2016-08-03 09:36:07 +08:00
|
|
|
xfs_rmapbt_compute_maxlevels(mp);
|
2016-10-04 00:11:18 +08:00
|
|
|
xfs_refcountbt_compute_maxlevels(mp);
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2021-09-17 03:27:34 +08:00
|
|
|
xfs_agbtree_compute_maxlevels(mp);
|
|
|
|
|
2019-12-19 03:13:16 +08:00
|
|
|
/*
|
|
|
|
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
|
|
|
|
* is NOT aligned turn off m_dalign since allocator alignment is within
|
|
|
|
* an ag, therefore ag has to be aligned at stripe boundary. Note that
|
|
|
|
* we must compute the free space and rmap btree geometry before doing
|
|
|
|
* this.
|
|
|
|
*/
|
|
|
|
error = xfs_update_alignment(mp);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
|
2016-05-18 09:11:27 +08:00
|
|
|
/* enable fail_at_unmount as default */
|
2017-10-10 02:38:54 +08:00
|
|
|
mp->m_fail_unmount = true;
|
2016-05-18 09:11:27 +08:00
|
|
|
|
2024-02-07 10:56:20 +08:00
|
|
|
super_set_sysfs_name_id(mp->m_super);
|
|
|
|
|
2019-11-05 05:58:40 +08:00
|
|
|
error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
|
|
|
|
NULL, mp->m_super->s_id);
|
2009-03-30 16:21:31 +08:00
|
|
|
if (error)
|
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-10-12 15:21:19 +08:00
|
|
|
error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
|
|
|
|
&mp->m_kobj, "stats");
|
2014-07-15 06:07:01 +08:00
|
|
|
if (error)
|
|
|
|
goto out_remove_sysfs;
|
|
|
|
|
2023-08-10 22:48:07 +08:00
|
|
|
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
|
|
|
|
|
2016-05-18 08:58:51 +08:00
|
|
|
error = xfs_error_sysfs_init(mp);
|
2015-10-12 15:21:19 +08:00
|
|
|
if (error)
|
2023-08-10 22:48:07 +08:00
|
|
|
goto out_remove_scrub_stats;
|
2015-10-12 15:21:19 +08:00
|
|
|
|
2017-06-21 08:54:46 +08:00
|
|
|
error = xfs_errortag_init(mp);
|
|
|
|
if (error)
|
|
|
|
goto out_remove_error_sysfs;
|
2016-05-18 08:58:51 +08:00
|
|
|
|
|
|
|
error = xfs_uuid_mount(mp);
|
|
|
|
if (error)
|
2017-06-21 08:54:46 +08:00
|
|
|
goto out_remove_errortag;
|
2016-05-18 08:58:51 +08:00
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
2019-10-28 23:41:45 +08:00
|
|
|
* Update the preferred write size based on the information from the
|
|
|
|
* on-disk superblock.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
2019-10-28 23:41:45 +08:00
|
|
|
mp->m_allocsize_log =
|
|
|
|
max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
|
|
|
|
mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2011-01-04 08:35:03 +08:00
|
|
|
/* set the low space thresholds for dynamic preallocation */
|
|
|
|
xfs_set_low_space_thresholds(mp);
|
|
|
|
|
2015-05-29 06:57:27 +08:00
|
|
|
/*
|
|
|
|
* If enabled, sparse inode chunk alignment is expected to match the
|
|
|
|
* cluster size. Full inode chunk alignment must match the chunk size,
|
|
|
|
* but that is checked on sb read verification...
|
|
|
|
*/
|
2021-08-19 09:46:37 +08:00
|
|
|
if (xfs_has_sparseinodes(mp) &&
|
2015-05-29 06:57:27 +08:00
|
|
|
mp->m_sb.sb_spino_align !=
|
2019-06-06 02:19:35 +08:00
|
|
|
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
|
2015-05-29 06:57:27 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"Sparse inode block alignment (%u) must match cluster size (%llu).",
|
|
|
|
mp->m_sb.sb_spino_align,
|
2019-06-06 02:19:35 +08:00
|
|
|
XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
|
2015-05-29 06:57:27 +08:00
|
|
|
error = -EINVAL;
|
|
|
|
goto out_remove_uuid;
|
|
|
|
}
|
|
|
|
|
2007-10-12 09:03:40 +08:00
|
|
|
/*
|
2013-08-12 11:15:03 +08:00
|
|
|
* Check that the data (and log if separate) is an ok size.
|
2007-10-12 09:03:40 +08:00
|
|
|
*/
|
2008-08-13 14:49:32 +08:00
|
|
|
error = xfs_check_sizes(mp);
|
2007-10-12 09:03:40 +08:00
|
|
|
if (error)
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_remove_uuid;
|
2007-10-12 09:03:40 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Initialize realtime fields in the mount structure
|
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
error = xfs_rtmount_init(mp);
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "RT mount failed");
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_remove_uuid;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Copies the low order bits of the timestamp and the randomly
|
|
|
|
* set "sequence" number out of a UUID.
|
|
|
|
*/
|
2017-05-05 15:39:10 +08:00
|
|
|
mp->m_fixedfsid[0] =
|
|
|
|
(get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
|
|
|
|
get_unaligned_be16(&sbp->sb_uuid.b[4]);
|
|
|
|
mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-06-06 13:01:58 +08:00
|
|
|
error = xfs_da_mount(mp);
|
|
|
|
if (error) {
|
|
|
|
xfs_warn(mp, "Failed dir/attr init: %d", error);
|
|
|
|
goto out_remove_uuid;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize the precomputed transaction reservations values.
|
|
|
|
*/
|
|
|
|
xfs_trans_init(mp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate and initialize the per-ag data.
|
|
|
|
*/
|
2022-07-07 17:13:02 +08:00
|
|
|
error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
|
|
|
|
&mp->m_maxagi);
|
2010-01-11 19:47:44 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Failed per-ag init: %d", error);
|
2014-06-06 13:01:58 +08:00
|
|
|
goto out_free_dir;
|
2010-01-11 19:47:44 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-11-12 04:53:22 +08:00
|
|
|
if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "no log defined");
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EFSCORRUPTED;
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_free_perag;
|
|
|
|
}
|
|
|
|
|
2021-08-07 02:05:43 +08:00
|
|
|
error = xfs_inodegc_register_shrinker(mp);
|
|
|
|
if (error)
|
|
|
|
goto out_fail_wait;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2015-08-19 07:58:36 +08:00
|
|
|
* Log's mount-time initialization. The first part of recovery can place
|
|
|
|
* some items on the AIL, to be handled when recovery is finished or
|
|
|
|
* cancelled.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2009-02-04 16:31:52 +08:00
|
|
|
error = xfs_log_mount(mp, mp->m_logdev_targp,
|
|
|
|
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
|
|
|
|
XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
|
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "log mount failed");
|
2021-08-07 02:05:43 +08:00
|
|
|
goto out_inodegc_shrinker;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2024-04-23 00:47:25 +08:00
|
|
|
/*
|
|
|
|
* If logged xattrs are still enabled after log recovery finishes, then
|
|
|
|
* they'll be available until unmount. Otherwise, turn them off.
|
|
|
|
*/
|
|
|
|
if (xfs_sb_version_haslogxattrs(&mp->m_sb))
|
|
|
|
xfs_set_using_logged_xattrs(mp);
|
|
|
|
else
|
|
|
|
xfs_clear_using_logged_xattrs(mp);
|
|
|
|
|
2021-08-07 02:05:39 +08:00
|
|
|
/* Enable background inode inactivation workers. */
|
|
|
|
xfs_inodegc_start(mp);
|
2021-08-07 02:05:42 +08:00
|
|
|
xfs_blockgc_start(mp);
|
2021-08-07 02:05:39 +08:00
|
|
|
|
xfs: rework attr2 feature and mount options
The attr2 feature is somewhat unique in that it has both a superblock
feature bit to enable it and mount options to enable and disable it.
Back when it was first introduced in 2005, attr2 was disabled unless
either the attr2 superblock feature bit was set, or the attr2 mount
option was set. If the superblock feature bit was not set but the
mount option was set, then when the first attr2 format inode fork
was created, it would set the superblock feature bit. This is as it
should be - the superblock feature bit indicated the presence of the
attr2 on disk format.
The noattr2 mount option, however, did not affect the superblock
feature bit. If noattr2 was specified, the on-disk superblock
feature bit was ignored and the code always just created attr1
format inode forks. If neither of the attr2 or noattr2 mounts
option were specified, then the behaviour was determined by the
superblock feature bit.
This was all pretty sane.
Fast forward 3 years, and we are dealing with fallout from the
botched sb_features2 addition and having to deal with feature
mismatches between the sb_features2 and sb_bad_features2 fields. The
attr2 feature bit was one of these flags. The reconciliation was
done well after mount option parsing and, unfortunately, the feature
reconciliation had a bug where it ignored the noattr2 mount option.
For reasons lost to the mists of time, it was decided that resolving
this issue in commit 7c12f296500e ("[XFS] Fix up noattr2 so that it
will properly update the versionnum and features2 fields.") required
noattr2 to clear the superblock attr2 feature bit. This greatly
complicated the attr2 behaviour and broke rules about feature bits
needing to be set when those specific features are present in the
filesystem.
By complicated, I mean that it introduced problems due to feature
bit interactions with log recovery. All of the superblock feature
bit checks are done prior to log recovery, but if we crash after
removing a feature bit, then on the next mount we see the feature
bit in the unrecovered superblock, only to have it go away after the
log has been replayed. This means our mount time feature processing
could be all wrong.
Hence you can mount with noattr2, crash shortly afterwards, and
mount again without attr2 or noattr2 and still have attr2 enabled
because the second mount sees attr2 still enabled in the superblock
before recovery runs and removes the feature bit. It's just a mess.
Further, this is all legacy code as the v5 format requires attr2 to
be enabled at all times and it cannot be disabled. i.e. the noattr2
mount option returns an error when used on v5 format filesystems.
To straighten this all out, this patch reverts the attr2/noattr2
mount option behaviour back to the original behaviour. There is no
reason for disabling attr2 these days, so we will only do this when
the noattr2 mount option is set. This will not remove the superblock
feature bit. The superblock bit will provide the default behaviour
and only track whether attr2 is present on disk or not. The attr2
mount option will enable the creation of attr2 format inode forks,
and if the superblock feature bit is not set it will be added when
the first attr2 inode fork is created.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-19 09:46:25 +08:00
|
|
|
/*
|
|
|
|
* Now that we've recovered any pending superblock feature bit
|
|
|
|
* additions, we can finish setting up the attr2 behaviour for the
|
2021-08-19 09:46:52 +08:00
|
|
|
* mount. The noattr2 option overrides the superblock flag, so only
|
|
|
|
* check the superblock feature flag if the mount option is not set.
|
xfs: rework attr2 feature and mount options
The attr2 feature is somewhat unique in that it has both a superblock
feature bit to enable it and mount options to enable and disable it.
Back when it was first introduced in 2005, attr2 was disabled unless
either the attr2 superblock feature bit was set, or the attr2 mount
option was set. If the superblock feature bit was not set but the
mount option was set, then when the first attr2 format inode fork
was created, it would set the superblock feature bit. This is as it
should be - the superblock feature bit indicated the presence of the
attr2 on disk format.
The noattr2 mount option, however, did not affect the superblock
feature bit. If noattr2 was specified, the on-disk superblock
feature bit was ignored and the code always just created attr1
format inode forks. If neither of the attr2 or noattr2 mounts
option were specified, then the behaviour was determined by the
superblock feature bit.
This was all pretty sane.
Fast forward 3 years, and we are dealing with fallout from the
botched sb_features2 addition and having to deal with feature
mismatches between the sb_features2 and sb_bad_features2 fields. The
attr2 feature bit was one of these flags. The reconciliation was
done well after mount option parsing and, unfortunately, the feature
reconciliation had a bug where it ignored the noattr2 mount option.
For reasons lost to the mists of time, it was decided that resolving
this issue in commit 7c12f296500e ("[XFS] Fix up noattr2 so that it
will properly update the versionnum and features2 fields.") required
noattr2 to clear the superblock attr2 feature bit. This greatly
complicated the attr2 behaviour and broke rules about feature bits
needing to be set when those specific features are present in the
filesystem.
By complicated, I mean that it introduced problems due to feature
bit interactions with log recovery. All of the superblock feature
bit checks are done prior to log recovery, but if we crash after
removing a feature bit, then on the next mount we see the feature
bit in the unrecovered superblock, only to have it go away after the
log has been replayed. This means our mount time feature processing
could be all wrong.
Hence you can mount with noattr2, crash shortly afterwards, and
mount again without attr2 or noattr2 and still have attr2 enabled
because the second mount sees attr2 still enabled in the superblock
before recovery runs and removes the feature bit. It's just a mess.
Further, this is all legacy code as the v5 format requires attr2 to
be enabled at all times and it cannot be disabled. i.e. the noattr2
mount option returns an error when used on v5 format filesystems.
To straighten this all out, this patch reverts the attr2/noattr2
mount option behaviour back to the original behaviour. There is no
reason for disabling attr2 these days, so we will only do this when
the noattr2 mount option is set. This will not remove the superblock
feature bit. The superblock bit will provide the default behaviour
and only track whether attr2 is present on disk or not. The attr2
mount option will enable the creation of attr2 format inode forks,
and if the superblock feature bit is not set it will be added when
the first attr2 inode fork is created.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-19 09:46:25 +08:00
|
|
|
*/
|
2021-08-19 09:46:52 +08:00
|
|
|
if (xfs_has_noattr2(mp)) {
|
|
|
|
mp->m_features &= ~XFS_FEAT_ATTR2;
|
|
|
|
} else if (!xfs_has_attr2(mp) &&
|
|
|
|
(mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
|
|
|
|
mp->m_features |= XFS_FEAT_ATTR2;
|
|
|
|
}
|
xfs: rework attr2 feature and mount options
The attr2 feature is somewhat unique in that it has both a superblock
feature bit to enable it and mount options to enable and disable it.
Back when it was first introduced in 2005, attr2 was disabled unless
either the attr2 superblock feature bit was set, or the attr2 mount
option was set. If the superblock feature bit was not set but the
mount option was set, then when the first attr2 format inode fork
was created, it would set the superblock feature bit. This is as it
should be - the superblock feature bit indicated the presence of the
attr2 on disk format.
The noattr2 mount option, however, did not affect the superblock
feature bit. If noattr2 was specified, the on-disk superblock
feature bit was ignored and the code always just created attr1
format inode forks. If neither of the attr2 or noattr2 mounts
option were specified, then the behaviour was determined by the
superblock feature bit.
This was all pretty sane.
Fast forward 3 years, and we are dealing with fallout from the
botched sb_features2 addition and having to deal with feature
mismatches between the sb_features2 and sb_bad_features2 fields. The
attr2 feature bit was one of these flags. The reconciliation was
done well after mount option parsing and, unfortunately, the feature
reconciliation had a bug where it ignored the noattr2 mount option.
For reasons lost to the mists of time, it was decided that resolving
this issue in commit 7c12f296500e ("[XFS] Fix up noattr2 so that it
will properly update the versionnum and features2 fields.") required
noattr2 to clear the superblock attr2 feature bit. This greatly
complicated the attr2 behaviour and broke rules about feature bits
needing to be set when those specific features are present in the
filesystem.
By complicated, I mean that it introduced problems due to feature
bit interactions with log recovery. All of the superblock feature
bit checks are done prior to log recovery, but if we crash after
removing a feature bit, then on the next mount we see the feature
bit in the unrecovered superblock, only to have it go away after the
log has been replayed. This means our mount time feature processing
could be all wrong.
Hence you can mount with noattr2, crash shortly afterwards, and
mount again without attr2 or noattr2 and still have attr2 enabled
because the second mount sees attr2 still enabled in the superblock
before recovery runs and removes the feature bit. It's just a mess.
Further, this is all legacy code as the v5 format requires attr2 to
be enabled at all times and it cannot be disabled. i.e. the noattr2
mount option returns an error when used on v5 format filesystems.
To straighten this all out, this patch reverts the attr2/noattr2
mount option behaviour back to the original behaviour. There is no
reason for disabling attr2 these days, so we will only do this when
the noattr2 mount option is set. This will not remove the superblock
feature bit. The superblock bit will provide the default behaviour
and only track whether attr2 is present on disk or not. The attr2
mount option will enable the creation of attr2 format inode forks,
and if the superblock feature bit is not set it will be added when
the first attr2 inode fork is created.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-19 09:46:25 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Get and sanity-check the root inode.
|
|
|
|
* Save the pointer to it in the mount structure.
|
|
|
|
*/
|
2018-06-06 01:09:33 +08:00
|
|
|
error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED,
|
|
|
|
XFS_ILOCK_EXCL, &rip);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (error) {
|
2018-06-06 01:09:33 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"Failed to read root inode 0x%llx, error %d",
|
|
|
|
sbp->sb_rootino, -error);
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_log_dealloc;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(rip != NULL);
|
|
|
|
|
2019-11-12 04:53:22 +08:00
|
|
|
if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "corrupted root inode %llu: not a directory",
|
2006-06-09 13:29:40 +08:00
|
|
|
(unsigned long long)rip->i_ino);
|
2005-04-17 06:20:36 +08:00
|
|
|
xfs_iunlock(rip, XFS_ILOCK_EXCL);
|
2014-06-25 12:58:08 +08:00
|
|
|
error = -EFSCORRUPTED;
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_rele_rip;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
mp->m_rootip = rip; /* save it */
|
|
|
|
|
|
|
|
xfs_iunlock(rip, XFS_ILOCK_EXCL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize realtime inode pointers in the mount structure
|
|
|
|
*/
|
2007-10-12 09:03:40 +08:00
|
|
|
error = xfs_rtmount_inodes(mp);
|
|
|
|
if (error) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Free up the root inode.
|
|
|
|
*/
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "failed to read RT inodes");
|
2009-02-04 16:31:52 +08:00
|
|
|
goto out_rele_rip;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2022-04-12 04:49:42 +08:00
|
|
|
/* Make sure the summary counts are ok. */
|
|
|
|
error = xfs_check_summary_counts(mp);
|
|
|
|
if (error)
|
|
|
|
goto out_rtunmount;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2009-01-19 09:04:07 +08:00
|
|
|
* If this is a read-only mount defer the superblock updates until
|
|
|
|
* the next remount into writeable mode. Otherwise we would never
|
|
|
|
* perform the update e.g. for the root filesystem.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2021-08-19 09:46:52 +08:00
|
|
|
if (mp->m_update_sb && !xfs_is_readonly(mp)) {
|
2015-01-22 06:10:31 +08:00
|
|
|
error = xfs_sync_sb(mp, false);
|
2008-04-10 10:21:18 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "failed to write sb changes");
|
2009-02-04 16:33:58 +08:00
|
|
|
goto out_rtunmount;
|
2008-04-10 10:21:18 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialise the XFS quota management subsystem for this mount
|
|
|
|
*/
|
2021-08-07 02:05:37 +08:00
|
|
|
if (XFS_IS_QUOTA_ON(mp)) {
|
2009-06-08 21:33:32 +08:00
|
|
|
error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
|
|
|
|
if (error)
|
|
|
|
goto out_rtunmount;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If a file system had quotas running earlier, but decided to
|
|
|
|
* mount without -o uquota/pquota/gquota options, revoke the
|
|
|
|
* quotachecked license.
|
|
|
|
*/
|
|
|
|
if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_notice(mp, "resetting quota flags");
|
2009-06-08 21:33:32 +08:00
|
|
|
error = xfs_mount_reset_sbqflags(mp);
|
|
|
|
if (error)
|
2014-07-15 05:41:25 +08:00
|
|
|
goto out_rtunmount;
|
2009-06-08 21:33:32 +08:00
|
|
|
}
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
2015-08-19 07:58:36 +08:00
|
|
|
* Finish recovering the file system. This part needed to be delayed
|
|
|
|
* until after the root and real-time bitmap inodes were consistently
|
2021-06-19 02:57:07 +08:00
|
|
|
* read in. Temporarily create per-AG space reservations for metadata
|
|
|
|
* btree shape changes because space freeing transactions (for inode
|
|
|
|
* inactivation) require the per-AG reservation in lieu of reserving
|
|
|
|
* blocks.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2021-06-19 02:57:07 +08:00
|
|
|
error = xfs_fs_reserve_ag_blocks(mp);
|
|
|
|
if (error && error == -ENOSPC)
|
|
|
|
xfs_warn(mp,
|
|
|
|
"ENOSPC reserving per-AG metadata pool, log recovery may fail.");
|
2008-08-13 14:49:32 +08:00
|
|
|
error = xfs_log_mount_finish(mp);
|
2021-06-19 02:57:07 +08:00
|
|
|
xfs_fs_unreserve_ag_blocks(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (error) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "log mount finish failed");
|
2009-02-04 16:33:58 +08:00
|
|
|
goto out_rtunmount;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
xfs: quiesce the filesystem after recovery on readonly mount
Recently we've had a number of reports where log recovery on a v5
filesystem has reported corruptions that looked to be caused by
recovery being re-run over the top of already-recovered
metadata. This has uncovered a bug in recovery (fixed elsewhere)
but the vector that caused this was largely unknown.
A kdump test started tripping over this problem - the system
would be crashed, the kdump kernel and environment would boot and
dump the kernel core image, and then the system would reboot. After
reboot, the root filesystem was triggering log recovery and
corruptions were being detected. The metadumps indicated the above
log recovery issue.
What is happening is that the kdump kernel and environment is
mounting the root device read-only to find the binaries needed to do
its work. The result of this is that it is running log recovery.
However, because there were unlinked files and EFIs to be processed
by recovery, the completion of phase 1 of log recovery could not
mark the log clean. And because it's a read-only mount, the unmount
process does not write records to the log to mark it clean, either.
Hence on the next mount of the filesystem, log recovery was run
again across all the metadata that had already been recovered and
this is what triggered corruption warnings.
To avoid this problem, we need to ensure that a read-only mount
always updates the log when it completes the second phase of
recovery. We already handle this sort of issue with rw->ro remount
transitions, so the solution is as simple as quiescing the
filesystem at the appropriate time during the mount process. This
results in the log being marked clean so the mount behaviour
recorded in the logs on repeated RO mounts will change (i.e. log
recovery will no longer be run on every mount until a RW mount is
done). This is a user visible change in behaviour, but it is
harmless.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-26 06:21:44 +08:00
|
|
|
/*
|
|
|
|
* Now the log is fully replayed, we can transition to full read-only
|
|
|
|
* mode for read-only mounts. This will sync all the metadata and clean
|
|
|
|
* the log so that the recovery we just performed does not have to be
|
|
|
|
* replayed again on the next mount.
|
|
|
|
*
|
|
|
|
* We use the same quiesce mechanism as the rw->ro remount, as they are
|
|
|
|
* semantically identical operations.
|
|
|
|
*/
|
2021-08-19 09:46:52 +08:00
|
|
|
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
|
2021-01-23 08:48:24 +08:00
|
|
|
xfs_log_clean(mp);
|
xfs: quiesce the filesystem after recovery on readonly mount
Recently we've had a number of reports where log recovery on a v5
filesystem has reported corruptions that looked to be caused by
recovery being re-run over the top of an already-recovered
metadata. This has uncovered a bug in recovery (fixed elsewhere)
but the vector that caused this was largely unknown.
A kdump test started tripping over this problem - the system
would be crashed, the kdump kernel and environment would boot and
dump the kernel core image, and then the system would reboot. After
reboot, the root filesystem was triggering log recovery and
corruptions were being detected. The metadumps indicated the above
log recovery issue.
What is happening is that the kdump kernel and environment is
mounting the root device read-only to find the binaries needed to do
its work. The result of this is that it is running log recovery.
However, because there were unlinked files and EFIs to be processed
by recovery, the completion of phase 1 of log recovery could not
mark the log clean. And because it's a read-only mount, the unmount
process does not write records to the log to mark it clean, either.
Hence on the next mount of the filesystem, log recovery was run
again across all the metadata that had already been recovered and
this is what triggered corruption warnings.
To avoid this problem, we need to ensure that a read-only mount
always updates the log when it completes the second phase of
recovery. We already handle this sort of issue with rw->ro remount
transitions, so the solution is as simple as quiescing the
filesystem at the appropriate time during the mount process. This
results in the log being marked clean so the mount behaviour
recorded in the logs on repeated RO mounts will change (i.e. log
recovery will no longer be run on every mount until a RW mount is
done). This is a user visible change in behaviour, but it is
harmless.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-09-26 06:21:44 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Complete the quota initialisation, post-log-replay component.
|
|
|
|
*/
|
2009-06-08 21:33:32 +08:00
|
|
|
if (quotamount) {
|
|
|
|
ASSERT(mp->m_qflags == 0);
|
|
|
|
mp->m_qflags = quotaflags;
|
|
|
|
|
|
|
|
xfs_qm_mount_quotas(mp);
|
|
|
|
}
|
|
|
|
|
2007-06-18 14:50:27 +08:00
|
|
|
/*
|
|
|
|
* Now we are mounted, reserve a small amount of unused space for
|
|
|
|
* privileged transactions. This is needed so that transaction
|
|
|
|
* space required for critical operations can dip into this pool
|
|
|
|
* when at ENOSPC. This is needed for operations like create with
|
|
|
|
* attr, unwritten extent conversion at ENOSPC, etc. Data allocations
|
|
|
|
* are not allowed to use this reserved space.
|
2010-03-04 09:46:25 +08:00
|
|
|
*
|
|
|
|
* This may drive us straight to ENOSPC on mount, but that implies
|
|
|
|
* we were already there on the last unmount. Warn if this occurs.
|
2007-06-18 14:50:27 +08:00
|
|
|
*/
|
2021-08-19 09:46:52 +08:00
|
|
|
if (!xfs_is_readonly(mp)) {
|
2023-12-05 01:40:56 +08:00
|
|
|
error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
|
2010-02-06 06:59:53 +08:00
|
|
|
if (error)
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp,
|
|
|
|
"Unable to allocate reserve blocks. Continuing without reserve pool.");
|
2016-10-04 00:11:39 +08:00
|
|
|
|
2016-10-04 00:11:44 +08:00
|
|
|
/* Reserve AG blocks for future btree expansion. */
|
|
|
|
error = xfs_fs_reserve_ag_blocks(mp);
|
|
|
|
if (error && error != -ENOSPC)
|
|
|
|
goto out_agresv;
|
2010-02-06 06:59:53 +08:00
|
|
|
}
|
2007-06-18 14:50:27 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
|
2016-10-04 00:11:44 +08:00
|
|
|
out_agresv:
|
|
|
|
xfs_fs_unreserve_ag_blocks(mp);
|
2016-10-04 00:11:39 +08:00
|
|
|
xfs_qm_unmount_quotas(mp);
|
2009-02-04 16:33:58 +08:00
|
|
|
out_rtunmount:
|
|
|
|
xfs_rtunmount_inodes(mp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_rele_rip:
|
2018-07-26 03:52:32 +08:00
|
|
|
xfs_irele(rip);
|
2017-08-11 05:20:29 +08:00
|
|
|
/* Clean out dquots that might be in memory after quotacheck. */
|
|
|
|
xfs_qm_unmount(mp);
|
2021-08-07 02:05:39 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Inactivate all inodes that might still be in memory after a log
|
|
|
|
* intent recovery failure so that reclaim can free them. Metadata
|
|
|
|
* inodes and the root directory shouldn't need inactivation, but the
|
|
|
|
* mount failed for some reason, so pull down all the state and flee.
|
|
|
|
*/
|
|
|
|
xfs_inodegc_flush(mp);
|
|
|
|
|
2017-11-09 08:26:49 +08:00
|
|
|
/*
|
2021-03-03 01:32:53 +08:00
|
|
|
* Flush all inode reclamation work and flush the log.
|
2017-11-09 08:26:49 +08:00
|
|
|
* We have to do this /after/ rtunmount and qm_unmount because those
|
|
|
|
* two will have scheduled delayed reclaim for the rt/quota inodes.
|
|
|
|
*
|
|
|
|
* This is slightly different from the unmountfs call sequence
|
|
|
|
* because we could be tearing down a partially set up mount. In
|
|
|
|
* particular, if log_mount_finish fails we bail out without calling
|
|
|
|
* qm_unmount_quotas and therefore rely on qm_unmount to release the
|
|
|
|
* quota inodes.
|
|
|
|
*/
|
2021-03-03 01:32:53 +08:00
|
|
|
xfs_unmount_flush_inodes(mp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_log_dealloc:
|
2015-08-19 07:58:36 +08:00
|
|
|
xfs_log_mount_cancel(mp);
|
2021-08-07 02:05:43 +08:00
|
|
|
out_inodegc_shrinker:
|
2023-09-11 17:44:34 +08:00
|
|
|
shrinker_free(mp->m_inodegc_shrinker);
|
2012-04-23 13:59:06 +08:00
|
|
|
out_fail_wait:
|
|
|
|
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
|
2021-01-23 08:48:19 +08:00
|
|
|
xfs_buftarg_drain(mp->m_logdev_targp);
|
|
|
|
xfs_buftarg_drain(mp->m_ddev_targp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_free_perag:
|
2008-08-13 14:50:47 +08:00
|
|
|
xfs_free_perag(mp);
|
2014-06-06 13:01:58 +08:00
|
|
|
out_free_dir:
|
|
|
|
xfs_da_unmount(mp);
|
2009-02-04 16:31:52 +08:00
|
|
|
out_remove_uuid:
|
2009-03-30 16:21:31 +08:00
|
|
|
xfs_uuid_unmount(mp);
|
2017-06-21 08:54:46 +08:00
|
|
|
out_remove_errortag:
|
|
|
|
xfs_errortag_del(mp);
|
2016-05-18 08:58:51 +08:00
|
|
|
out_remove_error_sysfs:
|
|
|
|
xfs_error_sysfs_del(mp);
|
2023-08-10 22:48:07 +08:00
|
|
|
out_remove_scrub_stats:
|
|
|
|
xchk_stats_unregister(mp->m_scrub_stats);
|
2015-10-12 15:21:19 +08:00
|
|
|
xfs_sysfs_del(&mp->m_stats.xs_kobj);
|
2014-07-15 06:07:01 +08:00
|
|
|
out_remove_sysfs:
|
|
|
|
xfs_sysfs_del(&mp->m_kobj);
|
2009-02-04 16:31:52 +08:00
|
|
|
out:
|
2005-04-17 06:20:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This flushes out the inodes,dquots and the superblock, unmounts the
|
|
|
|
* log and makes sure that incore structures are freed.
|
|
|
|
*/
|
2008-08-13 14:49:57 +08:00
|
|
|
void
|
|
|
|
xfs_unmountfs(
|
|
|
|
struct xfs_mount *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-08-13 14:49:57 +08:00
|
|
|
int error;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2021-08-07 02:05:39 +08:00
|
|
|
/*
|
|
|
|
* Perform all on-disk metadata updates required to inactivate inodes
|
|
|
|
* that the VFS evicted earlier in the unmount process. Freeing inodes
|
|
|
|
* and discarding CoW fork preallocations can cause shape changes to
|
|
|
|
* the free inode and refcount btrees, respectively, so we must finish
|
|
|
|
* this before we discard the metadata space reservations. Metadata
|
|
|
|
* inodes and the root directory do not require inactivation.
|
|
|
|
*/
|
|
|
|
xfs_inodegc_flush(mp);
|
|
|
|
|
2021-01-23 08:48:44 +08:00
|
|
|
xfs_blockgc_stop(mp);
|
2016-10-04 00:11:44 +08:00
|
|
|
xfs_fs_unreserve_ag_blocks(mp);
|
2009-06-08 21:33:32 +08:00
|
|
|
xfs_qm_unmount_quotas(mp);
|
2009-02-04 16:33:58 +08:00
|
|
|
xfs_rtunmount_inodes(mp);
|
2018-07-26 03:52:32 +08:00
|
|
|
xfs_irele(mp->m_rootip);
|
2008-08-13 14:49:04 +08:00
|
|
|
|
2021-03-03 01:32:53 +08:00
|
|
|
xfs_unmount_flush_inodes(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-08 21:33:32 +08:00
|
|
|
xfs_qm_unmount(mp);
|
2008-10-30 13:53:25 +08:00
|
|
|
|
2007-06-18 14:50:27 +08:00
|
|
|
/*
|
|
|
|
* Unreserve any blocks we have so that when we unmount we don't account
|
|
|
|
* the reserved free space as used. This is really only necessary for
|
|
|
|
* lazy superblock counting because it trusts the incore superblock
|
2009-03-29 15:55:42 +08:00
|
|
|
* counters to be absolutely correct on clean unmount.
|
2007-06-18 14:50:27 +08:00
|
|
|
*
|
|
|
|
* We don't bother correcting this elsewhere for lazy superblock
|
|
|
|
* counting because on mount of an unclean filesystem we reconstruct the
|
|
|
|
* correct counter value and this is irrelevant.
|
|
|
|
*
|
|
|
|
* For non-lazy counter filesystems, this doesn't matter at all because
|
|
|
|
* we only every apply deltas to the superblock and hence the incore
|
|
|
|
* value does not matter....
|
|
|
|
*/
|
2023-12-05 01:40:56 +08:00
|
|
|
error = xfs_reserve_blocks(mp, 0);
|
2008-04-10 10:20:03 +08:00
|
|
|
if (error)
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_warn(mp, "Unable to free reserved block pool. "
|
2008-04-10 10:20:03 +08:00
|
|
|
"Freespace may not be correct on next mount.");
|
xfs: fix sb write verify for lazysbcount
When lazysbcount is enabled, fsstress and loop mount/unmount test report
the following problems:
XFS (loop0): SB summary counter sanity check failed
XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460,
xfs_sb block 0x0
XFS (loop0): Unmount and run xfs_repair
XFS (loop0): First 128 bytes of corrupted metadata buffer:
00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(..
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z
00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... ..........
00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................
00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................
00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................
00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................
XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply
+0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem.
XFS (loop0): Please unmount the filesystem and rectify the problem(s)
XFS (loop0): log mount/recovery failed: error -117
XFS (loop0): log mount failed
This corruption will shutdown the file system and the file system will
no longer be mountable. The following script can reproduce the problem,
but it may take a long time.
#!/bin/bash
device=/dev/sda
testdir=/mnt/test
round=0
function fail()
{
echo "$*"
exit 1
}
mkdir -p $testdir
while [ $round -lt 10000 ]
do
echo "******* round $round ********"
mkfs.xfs -f $device
mount $device $testdir || fail "mount failed!"
fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null &
sleep 4
killall -w fsstress
umount $testdir
xfs_repair -e $device > /dev/null
if [ $? -eq 2 ];then
echo "ERR CODE 2: Dirty log exception during repair."
exit 1
fi
round=$(($round+1))
done
With lazysbcount is enabled, There is no additional lock protection for
reading m_ifree and m_icount in xfs_log_sb(), if other cpu modifies the
m_ifree, this will make the m_ifree greater than m_icount. For example,
consider the following sequence and ifreedelta is postive:
CPU0 CPU1
xfs_log_sb xfs_trans_unreserve_and_mod_sb
---------- ------------------------------
percpu_counter_sum(&mp->m_icount)
percpu_counter_add_batch(&mp->m_icount,
idelta, XFS_ICOUNT_BATCH)
percpu_counter_add(&mp->m_ifree, ifreedelta);
percpu_counter_sum(&mp->m_ifree)
After this, incorrect inode count (sb_ifree > sb_icount) will be writen to
the log. In the subsequent writing of sb, incorrect inode count (sb_ifree >
sb_icount) will fail to pass the boundary check in xfs_validate_sb_write()
that cause the file system shutdown.
When lazysbcount is enabled, we don't need to guarantee that Lazy sb
counters are completely correct, but we do need to guarantee that sb_ifree
<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount
must be satisfied any time that there /cannot/ be other threads allocating
or freeing inode chunks. If the constraint is violated under these
circumstances, sb_i{count,free} (the ondisk superblock inode counters)
maybe incorrect and need to be marked sick at unmount, the count will
be rebuilt on the next mount.
Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks")
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-11-17 11:20:20 +08:00
|
|
|
xfs_unmount_check(mp);
|
2008-04-10 10:20:03 +08:00
|
|
|
|
xfs: only clear log incompat flags at clean unmount
While reviewing the online fsck patchset, someone spied the
xfs_swapext_can_use_without_log_assistance function and wondered why we
go through this inverted-bitmask dance to avoid setting the
XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT feature.
(The same principles apply to the logged extended attribute update
feature bit in the since-merged LARP series.)
The reason for this dance is that xfs_add_incompat_log_feature is an
expensive operation -- it forces the log, pushes the AIL, and then if
nobody's beaten us to it, sets the feature bit and issues a synchronous
write of the primary superblock. That could be a one-time cost
amortized over the life of the filesystem, but the log quiesce and cover
operations call xfs_clear_incompat_log_features to remove feature bits
opportunistically. On a moderately loaded filesystem this leads to us
cycling those bits on and off over and over, which hurts performance.
Why do we clear the log incompat bits? Back in ~2020 I think Dave and I
had a conversation on IRC[2] about what the log incompat bits represent.
IIRC in that conversation we decided that the log incompat bits protect
unrecovered log items so that old kernels won't try to recover them and
barf. Since a clean log has no protected log items, we could clear the
bits at cover/quiesce time.
As Dave Chinner pointed out in the thread, clearing log incompat bits at
unmount time has positive effects for golden root disk image generator
setups, since the generator could be running a newer kernel than what
gets written to the golden image -- if there are log incompat fields set
in the golden image that was generated by a newer kernel/OS image
builder then the provisioning host cannot mount the filesystem even
though the log is clean and recovery is unnecessary to mount the
filesystem.
Given that it's expensive to set log incompat bits, we really only want
to do that once per bit per mount. Therefore, I propose that we only
clear log incompat bits as part of writing a clean unmount record. Do
this by adding an operational state flag to the xfs mount that guards
whether or not the feature bit clearing can actually take place.
This eliminates the l_incompat_users rwsem that we use to protect a log
cleaning operation from clearing a feature bit that a frontend thread is
trying to set -- this lock adds another way to fail w.r.t. locking. For
the swapext series, I shard that into multiple locks just to work around
the lockdep complaints, and that's fugly.
Link: https://lore.kernel.org/linux-xfs/20240131230043.GA6180@frogsfrogsfrogs/
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2024-04-16 05:54:06 +08:00
|
|
|
/*
|
|
|
|
* Indicate that it's ok to clear log incompat bits before cleaning
|
|
|
|
* the log and writing the unmount record.
|
|
|
|
*/
|
|
|
|
xfs_set_done_with_log_incompat(mp);
|
2009-03-16 15:19:29 +08:00
|
|
|
xfs_log_unmount(mp);
|
2014-06-06 13:01:58 +08:00
|
|
|
xfs_da_unmount(mp);
|
2009-03-30 16:21:31 +08:00
|
|
|
xfs_uuid_unmount(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-08-13 14:17:37 +08:00
|
|
|
#if defined(DEBUG)
|
2017-06-21 08:54:46 +08:00
|
|
|
xfs_errortag_clearall(mp);
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
2023-09-11 17:44:34 +08:00
|
|
|
shrinker_free(mp->m_inodegc_shrinker);
|
2008-08-13 14:50:47 +08:00
|
|
|
xfs_free_perag(mp);
|
2014-07-15 06:07:01 +08:00
|
|
|
|
2017-06-21 08:54:46 +08:00
|
|
|
xfs_errortag_del(mp);
|
2016-05-18 08:58:51 +08:00
|
|
|
xfs_error_sysfs_del(mp);
|
2023-08-10 22:48:07 +08:00
|
|
|
xchk_stats_unregister(mp->m_scrub_stats);
|
2015-10-12 15:21:19 +08:00
|
|
|
xfs_sysfs_del(&mp->m_stats.xs_kobj);
|
2014-07-15 06:07:01 +08:00
|
|
|
xfs_sysfs_del(&mp->m_kobj);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2014-11-28 11:02:59 +08:00
|
|
|
/*
|
|
|
|
* Determine whether modifications can proceed. The caller specifies the minimum
|
|
|
|
* freeze level for which modifications should not be allowed. This allows
|
|
|
|
* certain operations to proceed while the freeze sequence is in progress, if
|
|
|
|
* necessary.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
xfs_fs_writable(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
int level)
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
{
|
2014-11-28 11:02:59 +08:00
|
|
|
ASSERT(level > SB_UNFROZEN);
|
|
|
|
if ((mp->m_super->s_writers.frozen >= level) ||
|
2021-08-19 09:46:53 +08:00
|
|
|
xfs_is_shutdown(mp) || xfs_is_readonly(mp))
|
2014-11-28 11:02:59 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all
typically modify the on disk superblock in some way.
create/unclink/mkdir/rmdir modify inode counts, allocation/freeing modify
free block counts.
When these counts are modified in a transaction, they must eventually lock
the superblock buffer and apply the mods. The buffer then remains locked
until the transaction is committed into the incore log buffer. The result
of this is that with enough transactions on the fly the incore superblock
buffer becomes a bottleneck.
The result of contention on the incore superblock buffer is that
transaction rates fall - the more pressure that is put on the superblock
buffer, the slower things go.
The key to removing the contention is to not require the superblock fields
in question to be locked. We do that by not marking the superblock dirty
in the transaction. IOWs, we modify the incore superblock but do not
modify the cached superblock buffer. In short, we do not log superblock
modifications to critical fields in the superblock on every transaction.
In fact we only do it just before we write the superblock to disk every
sync period or just before unmount.
This creates an interesting problem - if we don't log or write out the
fields in every transaction, then how do the values get recovered after a
crash? the answer is simple - we keep enough duplicate, logged information
in other structures that we can reconstruct the correct count after log
recovery has been performed.
It is the AGF and AGI structures that contain the duplicate information;
after recovery, we walk every AGI and AGF and sum their individual
counters to get the correct value, and we do a transaction into the log to
correct them. An optimisation of this is that if we have a clean unmount
record, we know the value in the superblock is correct, so we can avoid
the summation walk under normal conditions and so mount/recovery times do
not change under normal operation.
One wrinkle that was discovered during development was that the blocks
used in the freespace btrees are never accounted for in the AGF counters.
This was once a valid optimisation to make; when the filesystem is full,
the free space btrees are empty and consume no space. Hence when it
matters, the "accounting" is correct. But that means the when we do the
AGF summations, we would not have a correct count and xfs_check would
complain. Hence a new counter was added to track the number of blocks used
by the free space btrees. This is an *on-disk format change*.
As a result of this, lazy superblock counters are a mkfs option and at the
moment on linux there is no way to convert an old filesystem. This is
possible - xfs_db can be used to twiddle the right bits and then
xfs_repair will do the format conversion for you. Similarly, you can
convert backwards as well. At some point we'll add functionality to
xfs_admin to do the bit twiddling easily....
SGI-PV: 964999
SGI-Modid: xfs-linux-melb:xfs-kern:28652a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
2007-05-24 13:26:31 +08:00
|
|
|
}
|
|
|
|
|
2024-04-22 19:20:12 +08:00
|
|
|
void
|
|
|
|
xfs_add_freecounter(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct percpu_counter *counter,
|
|
|
|
uint64_t delta)
|
|
|
|
{
|
|
|
|
bool has_resv_pool = (counter == &mp->m_fdblocks);
|
|
|
|
uint64_t res_used;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the reserve pool is depleted, put blocks back into it first.
|
|
|
|
* Most of the time the pool is full.
|
|
|
|
*/
|
|
|
|
if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
|
|
|
|
percpu_counter_add(counter, delta);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
|
|
|
res_used = mp->m_resblks - mp->m_resblks_avail;
|
|
|
|
if (res_used > delta) {
|
|
|
|
mp->m_resblks_avail += delta;
|
|
|
|
} else {
|
|
|
|
delta -= res_used;
|
|
|
|
mp->m_resblks_avail = mp->m_resblks;
|
|
|
|
percpu_counter_add(counter, delta);
|
|
|
|
}
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
}
|
|
|
|
|
2015-02-23 18:22:03 +08:00
|
|
|
int
|
2024-04-22 19:20:12 +08:00
|
|
|
xfs_dec_freecounter(
|
2015-02-23 18:22:03 +08:00
|
|
|
struct xfs_mount *mp,
|
2022-04-12 04:49:42 +08:00
|
|
|
struct percpu_counter *counter,
|
2024-04-22 19:20:12 +08:00
|
|
|
uint64_t delta,
|
2015-02-23 18:22:03 +08:00
|
|
|
bool rsvd)
|
|
|
|
{
|
|
|
|
int64_t lcounter;
|
2022-04-12 04:49:42 +08:00
|
|
|
uint64_t set_aside = 0;
|
2015-02-23 18:22:03 +08:00
|
|
|
s32 batch;
|
2022-04-12 04:49:42 +08:00
|
|
|
bool has_resv_pool;
|
|
|
|
|
|
|
|
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
|
|
|
|
has_resv_pool = (counter == &mp->m_fdblocks);
|
|
|
|
if (rsvd)
|
|
|
|
ASSERT(has_resv_pool);
|
2015-02-23 18:22:03 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Taking blocks away, need to be more accurate the closer we
|
|
|
|
* are to zero.
|
|
|
|
*
|
|
|
|
* If the counter has a value of less than 2 * max batch size,
|
|
|
|
* then make everything serialise as we are real close to
|
|
|
|
* ENOSPC.
|
|
|
|
*/
|
2022-04-12 04:49:42 +08:00
|
|
|
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
|
2015-05-29 05:39:34 +08:00
|
|
|
XFS_FDBLOCKS_BATCH) < 0)
|
2015-02-23 18:22:03 +08:00
|
|
|
batch = 1;
|
|
|
|
else
|
2015-05-29 05:39:34 +08:00
|
|
|
batch = XFS_FDBLOCKS_BATCH;
|
2015-02-23 18:22:03 +08:00
|
|
|
|
xfs: set aside allocation btree blocks from block reservation
The blocks used for allocation btrees (bnobt and countbt) are
technically considered free space. This is because as free space is
used, allocbt blocks are removed and naturally become available for
traditional allocation. However, this means that a significant
portion of free space may consist of in-use btree blocks if free
space is severely fragmented.
On large filesystems with large perag reservations, this can lead to
a rare but nasty condition where a significant amount of physical
free space is available, but the majority of actual usable blocks
consist of in-use allocbt blocks. We have a record of a (~12TB, 32
AG) filesystem with multiple AGs in a state with ~2.5GB or so free
blocks tracked across ~300 total allocbt blocks, but effectively at
100% full because the the free space is entirely consumed by
refcountbt perag reservation.
Such a large perag reservation is by design on large filesystems.
The problem is that because the free space is so fragmented, this AG
contributes the 300 or so allocbt blocks to the global counters as
free space. If this pattern repeats across enough AGs, the
filesystem lands in a state where global block reservation can
outrun physical block availability. For example, a streaming
buffered write on the affected filesystem continues to allow delayed
allocation beyond the point where writeback starts to fail due to
physical block allocation failures. The expected behavior is for the
delalloc block reservation to fail gracefully with -ENOSPC before
physical block allocation failure is a possibility.
To address this problem, set aside in-use allocbt blocks at
reservation time and thus ensure they cannot be reserved until truly
available for physical allocation. This allows alloc btree metadata
to continue to reside in free space, but dynamically adjusts
reservation availability based on internal state. Note that the
logic requires that the allocbt counter is fully populated at
reservation time before it is fully effective. We currently rely on
the mount time AGF scan in the perag reservation initialization code
for this dependency on filesystems where it's most important (i.e.
with active perag reservations).
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-04-29 06:06:05 +08:00
|
|
|
/*
|
|
|
|
* Set aside allocbt blocks because these blocks are tracked as free
|
|
|
|
* space but not available for allocation. Technically this means that a
|
|
|
|
* single reservation cannot consume all remaining free space, but the
|
|
|
|
* ratio of allocbt blocks to usable free blocks should be rather small.
|
|
|
|
* The tradeoff without this is that filesystems that maintain high
|
|
|
|
* perag block reservations can over reserve physical block availability
|
|
|
|
* and fail physical allocation, which leads to much more serious
|
|
|
|
* problems (i.e. transaction abort, pagecache discards, etc.) than
|
|
|
|
* slightly premature -ENOSPC.
|
|
|
|
*/
|
2022-04-12 04:49:42 +08:00
|
|
|
if (has_resv_pool)
|
|
|
|
set_aside = xfs_fdblocks_unavailable(mp);
|
2024-04-22 19:20:12 +08:00
|
|
|
percpu_counter_add_batch(counter, -((int64_t)delta), batch);
|
2022-04-12 04:49:42 +08:00
|
|
|
if (__percpu_counter_compare(counter, set_aside,
|
2015-05-29 05:39:34 +08:00
|
|
|
XFS_FDBLOCKS_BATCH) >= 0) {
|
2015-02-23 18:22:03 +08:00
|
|
|
/* we had space! */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* lock up the sb for dipping into reserves before releasing the space
|
|
|
|
* that took us to ENOSPC.
|
|
|
|
*/
|
|
|
|
spin_lock(&mp->m_sb_lock);
|
2024-04-22 19:20:12 +08:00
|
|
|
percpu_counter_add(counter, delta);
|
2022-04-12 04:49:42 +08:00
|
|
|
if (!has_resv_pool || !rsvd)
|
2015-02-23 18:22:03 +08:00
|
|
|
goto fdblocks_enospc;
|
|
|
|
|
2024-04-22 19:20:12 +08:00
|
|
|
lcounter = (long long)mp->m_resblks_avail - delta;
|
2015-02-23 18:22:03 +08:00
|
|
|
if (lcounter >= 0) {
|
|
|
|
mp->m_resblks_avail = lcounter;
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
2020-04-28 02:00:42 +08:00
|
|
|
xfs_warn_once(mp,
|
|
|
|
"Reserve blocks depleted! Consider increasing reserve pool size.");
|
|
|
|
|
2015-02-23 18:22:03 +08:00
|
|
|
fdblocks_enospc:
|
|
|
|
spin_unlock(&mp->m_sb_lock);
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Used to free the superblock along various error paths.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_freesb(
|
2010-09-22 08:47:20 +08:00
|
|
|
struct xfs_mount *mp)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2010-09-22 08:47:20 +08:00
|
|
|
struct xfs_buf *bp = mp->m_sb_bp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_lock(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
mp->m_sb_bp = NULL;
|
2010-09-22 08:47:20 +08:00
|
|
|
xfs_buf_relse(bp);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-02-15 17:44:46 +08:00
|
|
|
/*
|
|
|
|
* If the underlying (data/log/rt) device is readonly, there are some
|
|
|
|
* operations that cannot proceed.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_dev_is_read_only(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
char *message)
|
|
|
|
{
|
|
|
|
if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
|
|
|
|
xfs_readonly_buftarg(mp->m_logdev_targp) ||
|
|
|
|
(mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
|
2011-03-07 07:08:35 +08:00
|
|
|
xfs_notice(mp, "%s required on read-only device.", message);
|
|
|
|
xfs_notice(mp, "write access unavailable, cannot proceed.");
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EROFS;
|
2010-02-15 17:44:46 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2018-07-21 00:28:40 +08:00
|
|
|
|
|
|
|
/* Force the summary counters to be recalculated at next mount. */
|
|
|
|
void
|
|
|
|
xfs_force_summary_recalc(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2021-08-19 09:46:37 +08:00
|
|
|
if (!xfs_has_lazysbcount(mp))
|
2018-07-21 00:28:40 +08:00
|
|
|
return;
|
|
|
|
|
2019-04-12 22:41:15 +08:00
|
|
|
xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
|
2018-07-21 00:28:40 +08:00
|
|
|
}
|
2019-04-26 09:26:22 +08:00
|
|
|
|
2021-08-08 23:27:12 +08:00
|
|
|
/*
|
|
|
|
* Enable a log incompat feature flag in the primary superblock. The caller
|
|
|
|
* cannot have any other transactions in progress.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_add_incompat_log_feature(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
uint32_t feature)
|
|
|
|
{
|
|
|
|
struct xfs_dsb *dsb;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ASSERT(hweight32(feature) == 1);
|
|
|
|
ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Force the log to disk and kick the background AIL thread to reduce
|
|
|
|
* the chances that the bwrite will stall waiting for the AIL to unpin
|
|
|
|
* the primary superblock buffer. This isn't a data integrity
|
|
|
|
* operation, so we don't need a synchronous push.
|
|
|
|
*/
|
|
|
|
error = xfs_log_force(mp, XFS_LOG_SYNC);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
xfs_ail_push_all(mp->m_ail);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lock the primary superblock buffer to serialize all callers that
|
|
|
|
* are trying to set feature bits.
|
|
|
|
*/
|
|
|
|
xfs_buf_lock(mp->m_sb_bp);
|
|
|
|
xfs_buf_hold(mp->m_sb_bp);
|
|
|
|
|
2021-08-19 09:46:53 +08:00
|
|
|
if (xfs_is_shutdown(mp)) {
|
2021-08-08 23:27:12 +08:00
|
|
|
error = -EIO;
|
|
|
|
goto rele;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
|
|
|
|
goto rele;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write the primary superblock to disk immediately, because we need
|
|
|
|
* the log_incompat bit to be set in the primary super now to protect
|
|
|
|
* the log items that we're going to commit later.
|
|
|
|
*/
|
|
|
|
dsb = mp->m_sb_bp->b_addr;
|
|
|
|
xfs_sb_to_disk(dsb, &mp->m_sb);
|
|
|
|
dsb->sb_features_log_incompat |= cpu_to_be32(feature);
|
|
|
|
error = xfs_bwrite(mp->m_sb_bp);
|
|
|
|
if (error)
|
|
|
|
goto shutdown;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the feature bits to the incore superblock before we unlock the
|
|
|
|
* buffer.
|
|
|
|
*/
|
|
|
|
xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
|
|
|
|
xfs_buf_relse(mp->m_sb_bp);
|
|
|
|
|
|
|
|
/* Log the superblock to disk. */
|
|
|
|
return xfs_sync_sb(mp, false);
|
|
|
|
shutdown:
|
|
|
|
xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
|
|
|
|
rele:
|
|
|
|
xfs_buf_relse(mp->m_sb_bp);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear all the log incompat flags from the superblock.
|
|
|
|
*
|
|
|
|
* The caller cannot be in a transaction, must ensure that the log does not
|
|
|
|
* contain any log items protected by any log incompat bit, and must ensure
|
|
|
|
* that there are no other threads that depend on the state of the log incompat
|
|
|
|
* feature flags in the primary super.
|
|
|
|
*
|
|
|
|
* Returns true if the superblock is dirty.
|
|
|
|
*/
|
|
|
|
bool
|
|
|
|
xfs_clear_incompat_log_features(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
bool ret = false;
|
|
|
|
|
2021-08-19 09:46:55 +08:00
|
|
|
if (!xfs_has_crc(mp) ||
|
2021-08-08 23:27:12 +08:00
|
|
|
!xfs_sb_has_incompat_log_feature(&mp->m_sb,
|
|
|
|
XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
|
xfs: only clear log incompat flags at clean unmount
While reviewing the online fsck patchset, someone spied the
xfs_swapext_can_use_without_log_assistance function and wondered why we
go through this inverted-bitmask dance to avoid setting the
XFS_SB_FEAT_INCOMPAT_LOG_SWAPEXT feature.
(The same principles apply to the logged extended attribute update
feature bit in the since-merged LARP series.)
The reason for this dance is that xfs_add_incompat_log_feature is an
expensive operation -- it forces the log, pushes the AIL, and then if
nobody's beaten us to it, sets the feature bit and issues a synchronous
write of the primary superblock. That could be a one-time cost
amortized over the life of the filesystem, but the log quiesce and cover
operations call xfs_clear_incompat_log_features to remove feature bits
opportunistically. On a moderately loaded filesystem this leads to us
cycling those bits on and off over and over, which hurts performance.
Why do we clear the log incompat bits? Back in ~2020 I think Dave and I
had a conversation on IRC[2] about what the log incompat bits represent.
IIRC in that conversation we decided that the log incompat bits protect
unrecovered log items so that old kernels won't try to recover them and
barf. Since a clean log has no protected log items, we could clear the
bits at cover/quiesce time.
As Dave Chinner pointed out in the thread, clearing log incompat bits at
unmount time has positive effects for golden root disk image generator
setups, since the generator could be running a newer kernel than what
gets written to the golden image -- if there are log incompat fields set
in the golden image that was generated by a newer kernel/OS image
builder then the provisioning host cannot mount the filesystem even
though the log is clean and recovery is unnecessary to mount the
filesystem.
Given that it's expensive to set log incompat bits, we really only want
to do that once per bit per mount. Therefore, I propose that we only
clear log incompat bits as part of writing a clean unmount record. Do
this by adding an operational state flag to the xfs mount that guards
whether or not the feature bit clearing can actually take place.
This eliminates the l_incompat_users rwsem that we use to protect a log
cleaning operation from clearing a feature bit that a frontend thread is
trying to set -- this lock adds another way to fail w.r.t. locking. For
the swapext series, I shard that into multiple locks just to work around
the lockdep complaints, and that's fugly.
Link: https://lore.kernel.org/linux-xfs/20240131230043.GA6180@frogsfrogsfrogs/
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2024-04-16 05:54:06 +08:00
|
|
|
xfs_is_shutdown(mp) ||
|
|
|
|
!xfs_is_done_with_log_incompat(mp))
|
2021-08-08 23:27:12 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the incore superblock. We synchronize on the primary super
|
|
|
|
* buffer lock to be consistent with the add function, though at least
|
|
|
|
* in theory this shouldn't be necessary.
|
|
|
|
*/
|
|
|
|
xfs_buf_lock(mp->m_sb_bp);
|
|
|
|
xfs_buf_hold(mp->m_sb_bp);
|
|
|
|
|
|
|
|
if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
|
|
|
|
XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
|
|
|
|
xfs_sb_remove_incompat_log_features(&mp->m_sb);
|
|
|
|
ret = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
xfs_buf_relse(mp->m_sb_bp);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-26 09:26:22 +08:00
|
|
|
/*
|
|
|
|
* Update the in-core delayed block counter.
|
|
|
|
*
|
|
|
|
* We prefer to update the counter without having to take a spinlock for every
|
|
|
|
* counter update (i.e. batching). Each change to delayed allocation
|
|
|
|
* reservations can change can easily exceed the default percpu counter
|
|
|
|
* batching, so we use a larger batch factor here.
|
|
|
|
*
|
|
|
|
* Note that we don't currently have any callers requiring fast summation
|
|
|
|
* (e.g. percpu_counter_read) so we can use a big batch value here.
|
|
|
|
*/
|
|
|
|
#define XFS_DELALLOC_BATCH (4096)
|
|
|
|
void
|
|
|
|
xfs_mod_delalloc(
|
2024-04-22 19:20:15 +08:00
|
|
|
struct xfs_inode *ip,
|
|
|
|
int64_t data_delta,
|
|
|
|
int64_t ind_delta)
|
2019-04-26 09:26:22 +08:00
|
|
|
{
|
2024-04-22 19:20:15 +08:00
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
|
|
|
|
if (XFS_IS_REALTIME_INODE(ip)) {
|
|
|
|
percpu_counter_add_batch(&mp->m_delalloc_rtextents,
|
|
|
|
xfs_rtb_to_rtx(mp, data_delta),
|
|
|
|
XFS_DELALLOC_BATCH);
|
|
|
|
if (!ind_delta)
|
|
|
|
return;
|
|
|
|
data_delta = 0;
|
|
|
|
}
|
|
|
|
percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta,
|
2019-04-26 09:26:22 +08:00
|
|
|
XFS_DELALLOC_BATCH);
|
|
|
|
}
|