2019-09-23 22:05:19 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
|
|
|
|
#ifndef BTRFS_EXTENT_IO_TREE_H
|
|
|
|
#define BTRFS_EXTENT_IO_TREE_H
|
|
|
|
|
|
|
|
struct extent_changeset;
|
2019-09-23 22:05:21 +08:00
|
|
|
struct io_failure_record;
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
/* Bits for the extent state */
|
|
|
|
#define EXTENT_DIRTY (1U << 0)
|
|
|
|
#define EXTENT_UPTODATE (1U << 1)
|
|
|
|
#define EXTENT_LOCKED (1U << 2)
|
|
|
|
#define EXTENT_NEW (1U << 3)
|
|
|
|
#define EXTENT_DELALLOC (1U << 4)
|
|
|
|
#define EXTENT_DEFRAG (1U << 5)
|
|
|
|
#define EXTENT_BOUNDARY (1U << 6)
|
|
|
|
#define EXTENT_NODATASUM (1U << 7)
|
|
|
|
#define EXTENT_CLEAR_META_RESV (1U << 8)
|
|
|
|
#define EXTENT_NEED_WAIT (1U << 9)
|
|
|
|
#define EXTENT_NORESERVE (1U << 11)
|
|
|
|
#define EXTENT_QGROUP_RESERVED (1U << 12)
|
|
|
|
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 19:07:34 +08:00
|
|
|
/*
|
|
|
|
* Must be cleared only during ordered extent completion or on error paths if we
|
|
|
|
* did not manage to submit bios and create the ordered extents for the range.
|
|
|
|
* Should not be cleared during page release and page invalidation (if there is
|
|
|
|
* an ordered extent in flight), that is left for the ordered extent completion.
|
|
|
|
*/
|
2019-09-23 22:05:19 +08:00
|
|
|
#define EXTENT_DELALLOC_NEW (1U << 14)
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 19:07:34 +08:00
|
|
|
/*
|
|
|
|
* When an ordered extent successfully completes for a region marked as a new
|
|
|
|
* delalloc range, use this flag when clearing a new delalloc range to indicate
|
|
|
|
* that the VFS' inode number of bytes should be incremented and the inode's new
|
|
|
|
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
|
|
|
|
*/
|
|
|
|
#define EXTENT_ADD_INODE_BYTES (1U << 15)
|
2022-09-10 05:53:47 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Set during truncate when we're clearing an entire range and we just want the
|
|
|
|
* extent states to go away.
|
|
|
|
*/
|
|
|
|
#define EXTENT_CLEAR_ALL_BITS (1U << 16)
|
|
|
|
|
2019-09-23 22:05:19 +08:00
|
|
|
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
|
|
|
|
EXTENT_CLEAR_DATA_RESV)
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 19:07:34 +08:00
|
|
|
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
|
2022-09-10 05:53:47 +08:00
|
|
|
EXTENT_ADD_INODE_BYTES | \
|
|
|
|
EXTENT_CLEAR_ALL_BITS)
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Redefined bits above which are used only in the device allocation tree,
|
|
|
|
* shouldn't be using EXTENT_LOCKED / EXTENT_BOUNDARY / EXTENT_CLEAR_META_RESV
|
|
|
|
* / EXTENT_CLEAR_DATA_RESV because they have special meaning to the bit
|
|
|
|
* manipulation functions
|
|
|
|
*/
|
|
|
|
#define CHUNK_ALLOCATED EXTENT_DIRTY
|
|
|
|
#define CHUNK_TRIMMED EXTENT_DEFRAG
|
2020-07-31 19:29:11 +08:00
|
|
|
#define CHUNK_STATE_MASK (CHUNK_ALLOCATED | \
|
|
|
|
CHUNK_TRIMMED)
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
enum {
|
2020-01-20 22:09:18 +08:00
|
|
|
IO_TREE_FS_PINNED_EXTENTS,
|
|
|
|
IO_TREE_FS_EXCLUDED_EXTENTS,
|
2020-09-15 13:35:27 +08:00
|
|
|
IO_TREE_BTREE_INODE_IO,
|
2019-09-23 22:05:19 +08:00
|
|
|
IO_TREE_INODE_IO,
|
|
|
|
IO_TREE_RELOC_BLOCKS,
|
|
|
|
IO_TREE_TRANS_DIRTY_PAGES,
|
|
|
|
IO_TREE_ROOT_DIRTY_LOG_PAGES,
|
2020-01-17 22:02:21 +08:00
|
|
|
IO_TREE_INODE_FILE_EXTENT,
|
btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents
When we have extents shared amongst different inodes in the same subvolume,
if we fsync them in parallel we can end up with checksum items in the log
tree that represent ranges which overlap.
For example, consider we have inodes A and B, both sharing an extent that
covers the logical range from X to X + 64KiB:
1) Task A starts an fsync on inode A;
2) Task B starts an fsync on inode B;
3) Task A calls btrfs_csum_file_blocks(), and the first search in the
log tree, through btrfs_lookup_csum(), returns -EFBIG because it
finds an existing checksum item that covers the range from X - 64KiB
to X;
4) Task A checks that the checksum item has not reached the maximum
possible size (MAX_CSUM_ITEMS) and then releases the search path
before it does another path search for insertion (through a direct
call to btrfs_search_slot());
5) As soon as task A releases the path and before it does the search
for insertion, task B calls btrfs_csum_file_blocks() and gets -EFBIG
too, because there is an existing checksum item that has an end
offset that matches the start offset (X) of the checksum range we want
to log;
6) Task B releases the path;
7) Task A does the path search for insertion (through btrfs_search_slot())
and then verifies that the checksum item that ends at offset X still
exists and extends its size to insert the checksums for the range from
X to X + 64KiB;
8) Task A releases the path and returns from btrfs_csum_file_blocks(),
having inserted the checksums into an existing checksum item that got
its size extended. At this point we have one checksum item in the log
tree that covers the logical range from X - 64KiB to X + 64KiB;
9) Task B now does a search for insertion using btrfs_search_slot() too,
but it finds that the previous checksum item no longer ends at the
offset X, it now ends at an of offset X + 64KiB, so it leaves that item
untouched.
Then it releases the path and calls btrfs_insert_empty_item()
that inserts a checksum item with a key offset corresponding to X and
a size for inserting a single checksum (4 bytes in case of crc32c).
Subsequent iterations end up extending this new checksum item so that
it contains the checksums for the range from X to X + 64KiB.
So after task B returns from btrfs_csum_file_blocks() we end up with
two checksum items in the log tree that have overlapping ranges, one
for the range from X - 64KiB to X + 64KiB, and another for the range
from X to X + 64KiB.
Having checksum items that represent ranges which overlap, regardless of
being in the log tree or in the chekcsums tree, can lead to problems where
checksums for a file range end up not being found. This type of problem
has happened a few times in the past and the following commits fixed them
and explain in detail why having checksum items with overlapping ranges is
problematic:
27b9a8122ff71a "Btrfs: fix csum tree corruption, duplicate and outdated checksums"
b84b8390d6009c "Btrfs: fix file read corruption after extent cloning and fsync"
40e046acbd2f36 "Btrfs: fix missing data checksums after replaying a log tree"
Since this specific instance of the problem can only happen when logging
inodes, because it is the only case where concurrent attempts to insert
checksums for the same range can happen, fix the issue by using an extent
io tree as a range lock to serialize checksum insertion during inode
logging.
This issue could often be reproduced by the test case generic/457 from
fstests. When it happens it produces the following trace:
BTRFS critical (device dm-0): corrupt leaf: root=18446744073709551610 block=30625792 slot=42, csum end range (15020032) goes beyond the start range (15015936) of the next csum item
BTRFS info (device dm-0): leaf 30625792 gen 7 total ptrs 49 free space 2402 owner 18446744073709551610
BTRFS info (device dm-0): refs 1 lock (w:0 r:0 bw:0 br:0 sw:0 sr:0) lock_owner 0 current 15884
item 0 key (18446744073709551606 128 13979648) itemoff 3991 itemsize 4
item 1 key (18446744073709551606 128 13983744) itemoff 3987 itemsize 4
item 2 key (18446744073709551606 128 13987840) itemoff 3983 itemsize 4
item 3 key (18446744073709551606 128 13991936) itemoff 3979 itemsize 4
item 4 key (18446744073709551606 128 13996032) itemoff 3975 itemsize 4
item 5 key (18446744073709551606 128 14000128) itemoff 3971 itemsize 4
(...)
BTRFS error (device dm-0): block=30625792 write time tree block corruption detected
------------[ cut here ]------------
WARNING: CPU: 1 PID: 15884 at fs/btrfs/disk-io.c:539 btree_csum_one_bio+0x268/0x2d0 [btrfs]
Modules linked in: btrfs dm_thin_pool ...
CPU: 1 PID: 15884 Comm: fsx Tainted: G W 5.6.0-rc7-btrfs-next-58 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
RIP: 0010:btree_csum_one_bio+0x268/0x2d0 [btrfs]
Code: c7 c7 ...
RSP: 0018:ffffbb0109e6f8e0 EFLAGS: 00010296
RAX: 0000000000000000 RBX: ffffe1c0847b6080 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffffaa963988 RDI: 0000000000000001
RBP: ffff956a4f4d2000 R08: 0000000000000000 R09: 0000000000000001
R10: 0000000000000526 R11: 0000000000000000 R12: ffff956a5cd28bb0
R13: 0000000000000000 R14: ffff956a649c9388 R15: 000000011ed82000
FS: 00007fb419959e80(0000) GS:ffff956a7aa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000fe6d54 CR3: 0000000138696005 CR4: 00000000003606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btree_submit_bio_hook+0x67/0xc0 [btrfs]
submit_one_bio+0x31/0x50 [btrfs]
btree_write_cache_pages+0x2db/0x4b0 [btrfs]
? __filemap_fdatawrite_range+0xb1/0x110
do_writepages+0x23/0x80
__filemap_fdatawrite_range+0xd2/0x110
btrfs_write_marked_extents+0x15e/0x180 [btrfs]
btrfs_sync_log+0x206/0x10a0 [btrfs]
? kmem_cache_free+0x315/0x3b0
? btrfs_log_inode+0x1e8/0xf90 [btrfs]
? __mutex_unlock_slowpath+0x45/0x2a0
? lockref_put_or_lock+0x9/0x30
? dput+0x2d/0x580
? dput+0xb5/0x580
? btrfs_sync_file+0x464/0x4d0 [btrfs]
btrfs_sync_file+0x464/0x4d0 [btrfs]
do_fsync+0x38/0x60
__x64_sys_fsync+0x10/0x20
do_syscall_64+0x5c/0x280
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fb41953a6d0
Code: 48 3d ...
RSP: 002b:00007ffcc86bd218 EFLAGS: 00000246 ORIG_RAX: 000000000000004a
RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007fb41953a6d0
RDX: 0000000000000009 RSI: 0000000000040000 RDI: 0000000000000003
RBP: 0000000000040000 R08: 0000000000000001 R09: 0000000000000009
R10: 0000000000000064 R11: 0000000000000246 R12: 0000556cf4b2c060
R13: 0000000000000100 R14: 0000000000000000 R15: 0000556cf322b420
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffffa96bdedf>] copy_process+0x74f/0x2020
softirqs last enabled at (0): [<ffffffffa96bdedf>] copy_process+0x74f/0x2020
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace d543fc76f5ad7fd8 ]---
In that trace the tree checker detected the overlapping checksum items at
the time when we triggered writeback for the log tree when syncing the
log.
Another trace that can happen is due to BUG_ON() when deleting checksum
items while logging an inode:
BTRFS critical (device dm-0): slot 81 key (18446744073709551606 128 13635584) new key (18446744073709551606 128 13635584)
BTRFS info (device dm-0): leaf 30949376 gen 7 total ptrs 98 free space 8527 owner 18446744073709551610
BTRFS info (device dm-0): refs 4 lock (w:1 r:0 bw:0 br:0 sw:1 sr:0) lock_owner 13473 current 13473
item 0 key (257 1 0) itemoff 16123 itemsize 160
inode generation 7 size 262144 mode 100600
item 1 key (257 12 256) itemoff 16103 itemsize 20
item 2 key (257 108 0) itemoff 16050 itemsize 53
extent data disk bytenr 13631488 nr 4096
extent data offset 0 nr 131072 ram 131072
(...)
------------[ cut here ]------------
kernel BUG at fs/btrfs/ctree.c:3153!
invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 1 PID: 13473 Comm: fsx Not tainted 5.6.0-rc7-btrfs-next-58 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_set_item_key_safe+0x1ea/0x270 [btrfs]
Code: 0f b6 ...
RSP: 0018:ffff95e3889179d0 EFLAGS: 00010282
RAX: 0000000000000000 RBX: 0000000000000051 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffffb7763988 RDI: 0000000000000001
RBP: fffffffffffffff6 R08: 0000000000000000 R09: 0000000000000001
R10: 00000000000009ef R11: 0000000000000000 R12: ffff8912a8ba5a08
R13: ffff95e388917a06 R14: ffff89138dcf68c8 R15: ffff95e388917ace
FS: 00007fe587084e80(0000) GS:ffff8913baa00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fe587091000 CR3: 0000000126dac005 CR4: 00000000003606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_del_csums+0x2f4/0x540 [btrfs]
copy_items+0x4b5/0x560 [btrfs]
btrfs_log_inode+0x910/0xf90 [btrfs]
btrfs_log_inode_parent+0x2a0/0xe40 [btrfs]
? dget_parent+0x5/0x370
btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
btrfs_sync_file+0x42b/0x4d0 [btrfs]
__x64_sys_msync+0x199/0x200
do_syscall_64+0x5c/0x280
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fe586c65760
Code: 00 f7 ...
RSP: 002b:00007ffe250f98b8 EFLAGS: 00000246 ORIG_RAX: 000000000000001a
RAX: ffffffffffffffda RBX: 00000000000040e1 RCX: 00007fe586c65760
RDX: 0000000000000004 RSI: 0000000000006b51 RDI: 00007fe58708b000
RBP: 0000000000006a70 R08: 0000000000000003 R09: 00007fe58700cb61
R10: 0000000000000100 R11: 0000000000000246 R12: 00000000000000e1
R13: 00007fe58708b000 R14: 0000000000006b51 R15: 0000558de021a420
Modules linked in: dm_log_writes ...
---[ end trace c92a7f447a8515f5 ]---
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-05-18 19:14:50 +08:00
|
|
|
IO_TREE_LOG_CSUM_RANGE,
|
2019-09-23 22:05:19 +08:00
|
|
|
IO_TREE_SELFTEST,
|
2020-08-20 15:42:46 +08:00
|
|
|
IO_TREE_DEVICE_ALLOC_STATE,
|
2019-09-23 22:05:19 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct extent_io_tree {
|
|
|
|
struct rb_root state;
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
void *private_data;
|
|
|
|
|
|
|
|
/* Who owns this io tree, should be one of IO_TREE_* */
|
|
|
|
u8 owner;
|
|
|
|
|
|
|
|
spinlock_t lock;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct extent_state {
|
|
|
|
u64 start;
|
|
|
|
u64 end; /* inclusive */
|
|
|
|
struct rb_node rb_node;
|
|
|
|
|
|
|
|
/* ADD NEW ELEMENTS AFTER THIS */
|
|
|
|
wait_queue_head_t wq;
|
|
|
|
refcount_t refs;
|
2020-11-13 20:51:40 +08:00
|
|
|
u32 state;
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
|
|
|
|
struct list_head leak_list;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
|
|
|
|
struct extent_io_tree *tree, unsigned int owner,
|
|
|
|
void *private_data);
|
|
|
|
void extent_io_tree_release(struct extent_io_tree *tree);
|
|
|
|
|
2022-09-10 05:53:43 +08:00
|
|
|
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
struct extent_state **cached);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
2022-10-01 04:45:09 +08:00
|
|
|
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
struct extent_state **cached);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
2022-09-10 05:53:18 +08:00
|
|
|
int __init extent_state_init_cachep(void);
|
|
|
|
void __cold extent_state_free_cachep(void);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
u64 count_range_bits(struct extent_io_tree *tree,
|
|
|
|
u64 *start, u64 search_end,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 max_bytes, u32 bits, int contig);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
void free_extent_state(struct extent_state *state);
|
|
|
|
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2020-11-13 20:51:40 +08:00
|
|
|
u32 bits, int filled, struct extent_state *cached_state);
|
2019-09-23 22:05:19 +08:00
|
|
|
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
2020-11-13 20:51:40 +08:00
|
|
|
u32 bits, struct extent_changeset *changeset);
|
2019-09-23 22:05:19 +08:00
|
|
|
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2022-09-10 05:53:47 +08:00
|
|
|
u32 bits, struct extent_state **cached, gfp_t mask,
|
|
|
|
struct extent_changeset *changeset);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
2022-09-10 05:53:23 +08:00
|
|
|
static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
|
2022-09-10 05:53:47 +08:00
|
|
|
u64 end, u32 bits,
|
2022-09-10 05:53:23 +08:00
|
|
|
struct extent_state **cached)
|
|
|
|
{
|
2022-09-10 05:53:47 +08:00
|
|
|
return __clear_extent_bit(tree, start, end, bits, cached,
|
2022-09-10 05:53:40 +08:00
|
|
|
GFP_NOFS, NULL);
|
2022-09-10 05:53:23 +08:00
|
|
|
}
|
|
|
|
|
2022-09-10 05:53:43 +08:00
|
|
|
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
|
|
|
|
struct extent_state **cached)
|
2019-09-23 22:05:19 +08:00
|
|
|
{
|
2022-09-10 05:53:47 +08:00
|
|
|
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
|
2022-09-10 05:53:40 +08:00
|
|
|
GFP_NOFS, NULL);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
2022-09-10 05:53:43 +08:00
|
|
|
static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, struct extent_state **cached)
|
2019-09-23 22:05:19 +08:00
|
|
|
{
|
2022-09-10 05:53:47 +08:00
|
|
|
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
|
2022-09-10 05:53:40 +08:00
|
|
|
GFP_ATOMIC, NULL);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 end, u32 bits)
|
2019-09-23 22:05:19 +08:00
|
|
|
{
|
2022-09-10 05:53:47 +08:00
|
|
|
return clear_extent_bit(tree, start, end, bits, NULL);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
|
2020-11-13 20:51:40 +08:00
|
|
|
u32 bits, struct extent_changeset *changeset);
|
2019-09-23 22:05:19 +08:00
|
|
|
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2022-09-10 05:53:42 +08:00
|
|
|
u32 bits, struct extent_state **cached_state, gfp_t mask);
|
2022-09-10 05:53:23 +08:00
|
|
|
|
|
|
|
static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, u32 bits)
|
|
|
|
{
|
2022-09-10 05:53:42 +08:00
|
|
|
return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
|
2022-09-10 05:53:23 +08:00
|
|
|
}
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 end, u32 bits)
|
2019-09-23 22:05:19 +08:00
|
|
|
{
|
2022-09-10 05:53:42 +08:00
|
|
|
return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, struct extent_state **cached_state)
|
|
|
|
{
|
2022-09-10 05:53:47 +08:00
|
|
|
return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
|
2022-09-10 05:53:40 +08:00
|
|
|
cached_state, GFP_NOFS, NULL);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, gfp_t mask)
|
|
|
|
{
|
2022-09-10 05:53:42 +08:00
|
|
|
return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, struct extent_state **cached)
|
|
|
|
{
|
|
|
|
return clear_extent_bit(tree, start, end,
|
|
|
|
EXTENT_DIRTY | EXTENT_DELALLOC |
|
2022-09-10 05:53:47 +08:00
|
|
|
EXTENT_DO_ACCOUNTING, cached);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
|
2020-11-13 20:51:40 +08:00
|
|
|
u32 bits, u32 clear_bits,
|
2019-09-23 22:05:19 +08:00
|
|
|
struct extent_state **cached_state);
|
|
|
|
|
|
|
|
static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 end, u32 extra_bits,
|
2019-09-23 22:05:19 +08:00
|
|
|
struct extent_state **cached_state)
|
|
|
|
{
|
|
|
|
return set_extent_bit(tree, start, end,
|
btrfs: remove unnecessary EXTENT_UPTODATE state in buffered I/O path
After we copied data to page cache in buffered I/O, we
1. Insert a EXTENT_UPTODATE state into inode's io_tree, by
endio_readpage_release_extent(), set_extent_delalloc() or
set_extent_defrag().
2. Set page uptodate before we unlock the page.
But the only place we check io_tree's EXTENT_UPTODATE state is in
btrfs_do_readpage(). We know we enter btrfs_do_readpage() only when we
have a non-uptodate page, so it is unnecessary to set EXTENT_UPTODATE.
For example, when performing a buffered random read:
fio --rw=randread --ioengine=libaio --direct=0 --numjobs=4 \
--filesize=32G --size=4G --bs=4k --name=job \
--filename=/mnt/file --name=job
Then check how many extent_state in io_tree:
cat /proc/slabinfo | grep btrfs_extent_state | awk '{print $2}'
w/o this patch, we got 640567 btrfs_extent_state.
w/ this patch, we got 204 btrfs_extent_state.
Maintaining such a big tree brings overhead since every I/O needs to insert
EXTENT_LOCKED, insert EXTENT_UPTODATE, then remove EXTENT_LOCKED. And in
every insert or remove, we need to lock io_tree, do tree search, alloc or
dealloc extent states. By removing unnecessary EXTENT_UPTODATE, we keep
io_tree in a minimal size and reduce overhead when performing buffered I/O.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-08-19 10:44:08 +08:00
|
|
|
EXTENT_DELALLOC | extra_bits,
|
2022-09-10 05:53:42 +08:00
|
|
|
cached_state, GFP_NOFS);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, struct extent_state **cached_state)
|
|
|
|
{
|
|
|
|
return set_extent_bit(tree, start, end,
|
btrfs: remove unnecessary EXTENT_UPTODATE state in buffered I/O path
After we copied data to page cache in buffered I/O, we
1. Insert a EXTENT_UPTODATE state into inode's io_tree, by
endio_readpage_release_extent(), set_extent_delalloc() or
set_extent_defrag().
2. Set page uptodate before we unlock the page.
But the only place we check io_tree's EXTENT_UPTODATE state is in
btrfs_do_readpage(). We know we enter btrfs_do_readpage() only when we
have a non-uptodate page, so it is unnecessary to set EXTENT_UPTODATE.
For example, when performing a buffered random read:
fio --rw=randread --ioengine=libaio --direct=0 --numjobs=4 \
--filesize=32G --size=4G --bs=4k --name=job \
--filename=/mnt/file --name=job
Then check how many extent_state in io_tree:
cat /proc/slabinfo | grep btrfs_extent_state | awk '{print $2}'
w/o this patch, we got 640567 btrfs_extent_state.
w/ this patch, we got 204 btrfs_extent_state.
Maintaining such a big tree brings overhead since every I/O needs to insert
EXTENT_LOCKED, insert EXTENT_UPTODATE, then remove EXTENT_LOCKED. And in
every insert or remove, we need to lock io_tree, do tree search, alloc or
dealloc extent states. By removing unnecessary EXTENT_UPTODATE, we keep
io_tree in a minimal size and reduce overhead when performing buffered I/O.
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-08-19 10:44:08 +08:00
|
|
|
EXTENT_DELALLOC | EXTENT_DEFRAG,
|
2022-09-10 05:53:42 +08:00
|
|
|
cached_state, GFP_NOFS);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end)
|
|
|
|
{
|
2022-09-10 05:53:42 +08:00
|
|
|
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
|
|
|
|
u64 end, struct extent_state **cached_state, gfp_t mask)
|
|
|
|
{
|
2022-09-10 05:53:41 +08:00
|
|
|
return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
|
2022-09-10 05:53:42 +08:00
|
|
|
cached_state, mask);
|
2019-09-23 22:05:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits,
|
2019-09-23 22:05:19 +08:00
|
|
|
struct extent_state **cached_state);
|
|
|
|
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits);
|
2020-01-17 22:02:21 +08:00
|
|
|
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
|
2020-11-13 20:51:40 +08:00
|
|
|
u64 *start_ret, u64 *end_ret, u32 bits);
|
2019-09-23 22:05:20 +08:00
|
|
|
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
|
|
|
|
u64 *end, u64 max_bytes,
|
|
|
|
struct extent_state **cached_state);
|
2022-10-01 04:45:12 +08:00
|
|
|
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
|
|
|
|
struct extent_state **cached_state);
|
2019-09-23 22:05:19 +08:00
|
|
|
|
|
|
|
#endif /* BTRFS_EXTENT_IO_TREE_H */
|