linux/fs/ext4/balloc.c

858 lines
25 KiB
C
Raw Normal View History

/*
* linux/fs/ext4/balloc.c
*
* Copyright (C) 1992, 1993, 1994, 1995
* Remy Card (card@masi.ibp.fr)
* Laboratoire MASI - Institut Blaise Pascal
* Universite Pierre et Marie Curie (Paris VI)
*
* Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
* Big-endian to little-endian byte-swapping/bitmaps by
* David S. Miller (davem@caip.rutgers.edu), 1995
*/
#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "mballoc.h"
#include <trace/events/ext4.h>
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group);
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
* Calculate block group number for a given block number
*/
ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block)
{
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
ext4: fix ext4_get_group_number() The function ext4_get_group_number() was introduced as an optimization in commit bd86298e60b8. Unfortunately, this commit incorrectly calculate the group number for file systems with a 1k block size (when s_first_data_block is 1 instead of zero). This could cause the following kernel BUG: [ 568.877799] ------------[ cut here ]------------ [ 568.877833] kernel BUG at fs/ext4/mballoc.c:3728! [ 568.877840] Oops: Exception in kernel mode, sig: 5 [#1] [ 568.877845] SMP NR_CPUS=32 NUMA pSeries [ 568.877852] Modules linked in: binfmt_misc [ 568.877861] CPU: 1 PID: 3516 Comm: fs_mark Not tainted 3.10.0-03216-g7c6809f-dirty #1 [ 568.877867] task: c0000001fb0b8000 ti: c0000001fa954000 task.ti: c0000001fa954000 [ 568.877873] NIP: c0000000002f42a4 LR: c0000000002f4274 CTR: c000000000317ef8 [ 568.877879] REGS: c0000001fa956ed0 TRAP: 0700 Not tainted (3.10.0-03216-g7c6809f-dirty) [ 568.877884] MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI> CR: 24000428 XER: 00000000 [ 568.877902] SOFTE: 1 [ 568.877905] CFAR: c0000000002b5464 [ 568.877908] GPR00: 0000000000000001 c0000001fa957150 c000000000c6a408 c0000001fb588000 GPR04: 0000000000003fff c0000001fa9571c0 c0000001fa9571c4 000138098c50625f GPR08: 1301200000000000 0000000000000002 0000000000000001 0000000000000000 GPR12: 0000000024000422 c00000000f33a300 0000000000008000 c0000001fa9577f0 GPR16: c0000001fb7d0100 c000000000c29190 c0000000007f46e8 c000000000a14672 GPR20: 0000000000000001 0000000000000008 ffffffffffffffff 0000000000000000 GPR24: 0000000000000100 c0000001fa957278 c0000001fdb2bc78 c0000001fa957288 GPR28: 0000000000100100 c0000001fa957288 c0000001fb588000 c0000001fdb2bd10 [ 568.877993] NIP [c0000000002f42a4] .ext4_mb_release_group_pa+0xec/0x1c0 [ 568.877999] LR [c0000000002f4274] .ext4_mb_release_group_pa+0xbc/0x1c0 [ 568.878004] Call Trace: [ 568.878008] [c0000001fa957150] [c0000000002f4274] .ext4_mb_release_group_pa+0xbc/0x1c0 (unreliable) [ 568.878017] [c0000001fa957200] [c0000000002fb070] .ext4_mb_discard_lg_preallocations+0x394/0x444 [ 568.878025] [c0000001fa957340] [c0000000002fb45c] .ext4_mb_release_context+0x33c/0x734 [ 568.878032] [c0000001fa957440] [c0000000002fbcf8] .ext4_mb_new_blocks+0x4a4/0x5f4 [ 568.878039] [c0000001fa957510] [c0000000002ef56c] .ext4_ext_map_blocks+0xc28/0x1178 [ 568.878047] [c0000001fa957640] [c0000000002c1a94] .ext4_map_blocks+0x2c8/0x490 [ 568.878054] [c0000001fa957730] [c0000000002c536c] .ext4_writepages+0x738/0xc60 [ 568.878062] [c0000001fa957950] [c000000000168a78] .do_writepages+0x5c/0x80 [ 568.878069] [c0000001fa9579d0] [c00000000015d1c4] .__filemap_fdatawrite_range+0x88/0xb0 [ 568.878078] [c0000001fa957aa0] [c00000000015d23c] .filemap_write_and_wait_range+0x50/0xfc [ 568.878085] [c0000001fa957b30] [c0000000002b8edc] .ext4_sync_file+0x220/0x3c4 [ 568.878092] [c0000001fa957be0] [c0000000001f849c] .vfs_fsync_range+0x64/0x80 [ 568.878098] [c0000001fa957c70] [c0000000001f84f0] .vfs_fsync+0x38/0x4c [ 568.878105] [c0000001fa957d00] [c0000000001f87f4] .do_fsync+0x54/0x90 [ 568.878111] [c0000001fa957db0] [c0000000001f8894] .SyS_fsync+0x28/0x3c [ 568.878120] [c0000001fa957e30] [c000000000009c88] syscall_exit+0x0/0x7c [ 568.878125] Instruction dump: [ 568.878130] 60000000 813d0034 81610070 38000000 7f8b4800 419e001c 813f007c 7d2bfe70 [ 568.878144] 7d604a78 7c005850 54000ffe 7c0007b4 <0b000000> e8a10076 e87f0090 7fa4eb78 [ 568.878160] ---[ end trace 594d911d9654770b ]--- In addition fix the STD_GROUP optimization so that it works for bigalloc file systems as well. Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reported-by: Li Zhong <lizhongfs@gmail.com> Reviewed-by: Lukas Czerner <lczerner@redhat.com> Cc: stable@vger.kernel.org # 3.10
2013-07-06 11:11:16 +08:00
group = (block -
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
return group;
}
/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
*/
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
ext4_grpblk_t offset;
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
EXT4_SB(sb)->s_cluster_bits;
if (offsetp)
*offsetp = offset;
if (blockgrpp)
*blockgrpp = blocknr;
}
/*
* Check whether the 'block' lives within the 'block_group'. Returns 1 if so
* and 0 otherwise.
*/
static inline int ext4_block_in_group(struct super_block *sb,
ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
actual_group = ext4_get_group_number(sb, block);
return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
unsigned ext4_num_overhead_clusters(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
unsigned num_clusters;
int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group);
ext4_fsblk_t itbl_blk;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* This is the number of clusters used by the superblock,
* block group descriptors, and reserved block group
* descriptor blocks */
num_clusters = ext4_num_base_meta_clusters(sb, block_group);
/*
* For the allocation bitmaps and inode table, we first need
* to check to see if the block is in the block group. If it
* is, then check to see if the cluster is already accounted
* for in the clusters used for the base metadata cluster, or
* if we can increment the base metadata cluster to include
* that block. Otherwise, we will have to track the cluster
* used for the allocation bitmap or inode table explicitly.
* Normally all of these blocks are contiguous, so the special
* case handling shouldn't be necessary except for *very*
* unusual file system layouts.
*/
if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) {
block_cluster = EXT4_B2C(sbi,
ext4_block_bitmap(sb, gdp) - start);
if (block_cluster < num_clusters)
block_cluster = -1;
else if (block_cluster == num_clusters) {
num_clusters++;
block_cluster = -1;
}
}
if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) {
inode_cluster = EXT4_B2C(sbi,
ext4_inode_bitmap(sb, gdp) - start);
if (inode_cluster < num_clusters)
inode_cluster = -1;
else if (inode_cluster == num_clusters) {
num_clusters++;
inode_cluster = -1;
}
}
itbl_blk = ext4_inode_table(sb, gdp);
for (i = 0; i < sbi->s_itb_per_group; i++) {
if (ext4_block_in_group(sb, itbl_blk + i, block_group)) {
c = EXT4_B2C(sbi, itbl_blk + i - start);
if ((c < num_clusters) || (c == inode_cluster) ||
(c == block_cluster) || (c == itbl_cluster))
continue;
if (c == num_clusters) {
num_clusters++;
continue;
}
num_clusters++;
itbl_cluster = c;
}
}
if (block_cluster != -1)
num_clusters++;
if (inode_cluster != -1)
num_clusters++;
return num_clusters;
}
static unsigned int num_clusters_in_group(struct super_block *sb,
ext4_group_t block_group)
{
unsigned int blocks;
if (block_group == ext4_get_groups_count(sb) - 1) {
/*
* Even though mke2fs always initializes the first and
* last group, just in case some other tool was used,
* we need to make sure we calculate the right free
* blocks.
*/
blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) -
ext4_group_first_block_no(sb, block_group);
} else
blocks = EXT4_BLOCKS_PER_GROUP(sb);
return EXT4_NUM_B2C(EXT4_SB(sb), blocks);
}
/* Initializes an uninitialized block bitmap */
void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
{
unsigned int bit, bit_max;
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
}
memset(bh->b_data, 0, sb->s_blocksize);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
bit_max = ext4_num_base_meta_clusters(sb, block_group);
for (bit = 0; bit < bit_max; bit++)
ext4_set_bit(bit, bh->b_data);
start = ext4_group_first_block_no(sb, block_group);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
flex_bg = 1;
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
/* Set bits for block and inode bitmaps, and inode table */
tmp = ext4_block_bitmap(sb, gdp);
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
tmp = ext4_inode_bitmap(sb, gdp);
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
tmp = ext4_inode_table(sb, gdp);
for (; tmp < ext4_inode_table(sb, gdp) +
sbi->s_itb_per_group; tmp++) {
if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
}
/*
* Also if the number of blocks within the group is less than
* the blocksize * 8 ( which is the size of bitmap ), set rest
* of the block bitmap to 1
*/
ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
sb->s_blocksize * 8, bh->b_data);
ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
}
/* Return the number of free blocks in a block group. It is used when
* the block bitmap is uninitialized, so we can't just count the bits
* in the bitmap. */
unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
{
return num_clusters_in_group(sb, block_group) -
ext4_num_overhead_clusters(sb, block_group, gdp);
}
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
/*
* The free blocks are managed by bitmaps. A file system contains several
* blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
* block for inodes, N blocks for the inode table and data blocks.
*
* The file system contains group descriptors which are located after the
* super block. Each descriptor contains the number of the bitmap block and
* the free blocks count in the block. The descriptors are loaded in memory
* when a file system is mounted (see ext4_fill_super).
*/
/**
* ext4_get_group_desc() -- load group descriptor from disk
* @sb: super block
* @block_group: given block group
* @bh: pointer to the buffer head to store the block
* group descriptor
*/
struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head **bh)
{
unsigned int group_desc;
unsigned int offset;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
" groups_count = %u", block_group, ngroups);
return NULL;
}
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
return NULL;
}
desc = (struct ext4_group_desc *)(
(__u8 *)sbi->s_group_desc[group_desc]->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
*bh = sbi->s_group_desc[group_desc];
return desc;
}
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
*/
static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
/* with FLEX_BG, the inode/block bitmaps and itable
* blocks may not be in the group at all
* so the bitmap validation will be skipped for those groups
* or it has to also read the block group where the bitmaps
* are located to verify they are set.
*/
return 0;
}
group_first_block = ext4_group_first_block_no(sb, block_group);
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
if (!ext4_test_bit(offset, bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
if (!ext4_test_bit(offset, bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode table block number is set */
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
offset + EXT4_SB(sb)->s_itb_per_group,
offset);
if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
/* bad bitmap for inode tables */
return blk;
return 0;
}
void ext4_validate_block_bitmap(struct super_block *sb,
struct ext4_group_desc *desc,
ext4_group_t block_group,
struct buffer_head *bh)
{
ext4_fsblk_t blk;
ext4: mark block group as corrupt on block bitmap error When we notice a block-bitmap corruption (because of device failure or something else), we should mark this group as corrupt and prevent further block allocations/deallocations from it. Currently, we end up generating one error message for every block in the bitmap. This potentially could make the system unstable as noticed in some bugs. With this patch, the error will be printed only the first time and mark the entire block group as corrupted. This prevents future access allocations/deallocations from it. Also tested by corrupting the block bitmap and forcefully introducing the mb_free_blocks error: (1) create a largefile (2Gb) $ dd if=/dev/zero of=largefile oflag=direct bs=10485760 count=200 (2) umount filesystem. use dumpe2fs to see which block-bitmaps are in use by largefile and note their block numbers (3) use dd to zero-out the used block bitmaps $ dd if=/dev/zero of=/dev/hdc4 bs=4096 seek=14 count=8 oflag=direct (4) mount the FS and delete the largefile. (5) recreate the largefile. verify that the new largefile does not get any blocks from the groups marked as bad. Without the patch, we will see mb_free_blocks error for each bit in each zero'ed out bitmap at (4). With the patch, we only see the error once per blockgroup: [ 309.706803] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 15: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.720824] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 14: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.732858] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.748321] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 13: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.760331] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.769695] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 12: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.781721] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.798166] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 11: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.810184] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.819532] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 10: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. Google-Bug-Id: 7258357 [darrick.wong@oracle.com] Further modifications (by Darrick) to make more obvious that this corruption bit applies to blocks only. Set the corruption flag if the block group bitmap verification fails. Original-author: Aditya Kali <adityakali@google.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-08-29 05:35:51 +08:00
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh))
return;
ext4_lock_group(sb, block_group);
blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
if (unlikely(blk != 0)) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
ext4: mark block group as corrupt on block bitmap error When we notice a block-bitmap corruption (because of device failure or something else), we should mark this group as corrupt and prevent further block allocations/deallocations from it. Currently, we end up generating one error message for every block in the bitmap. This potentially could make the system unstable as noticed in some bugs. With this patch, the error will be printed only the first time and mark the entire block group as corrupted. This prevents future access allocations/deallocations from it. Also tested by corrupting the block bitmap and forcefully introducing the mb_free_blocks error: (1) create a largefile (2Gb) $ dd if=/dev/zero of=largefile oflag=direct bs=10485760 count=200 (2) umount filesystem. use dumpe2fs to see which block-bitmaps are in use by largefile and note their block numbers (3) use dd to zero-out the used block bitmaps $ dd if=/dev/zero of=/dev/hdc4 bs=4096 seek=14 count=8 oflag=direct (4) mount the FS and delete the largefile. (5) recreate the largefile. verify that the new largefile does not get any blocks from the groups marked as bad. Without the patch, we will see mb_free_blocks error for each bit in each zero'ed out bitmap at (4). With the patch, we only see the error once per blockgroup: [ 309.706803] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 15: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.720824] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 14: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.732858] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.748321] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 13: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.760331] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.769695] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 12: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.781721] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.798166] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 11: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.810184] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.819532] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 10: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. Google-Bug-Id: 7258357 [darrick.wong@oracle.com] Further modifications (by Darrick) to make more obvious that this corruption bit applies to blocks only. Set the corruption flag if the block group bitmap verification fails. Original-author: Aditya Kali <adityakali@google.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-08-29 05:35:51 +08:00
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
ext4: mark block group as corrupt on block bitmap error When we notice a block-bitmap corruption (because of device failure or something else), we should mark this group as corrupt and prevent further block allocations/deallocations from it. Currently, we end up generating one error message for every block in the bitmap. This potentially could make the system unstable as noticed in some bugs. With this patch, the error will be printed only the first time and mark the entire block group as corrupted. This prevents future access allocations/deallocations from it. Also tested by corrupting the block bitmap and forcefully introducing the mb_free_blocks error: (1) create a largefile (2Gb) $ dd if=/dev/zero of=largefile oflag=direct bs=10485760 count=200 (2) umount filesystem. use dumpe2fs to see which block-bitmaps are in use by largefile and note their block numbers (3) use dd to zero-out the used block bitmaps $ dd if=/dev/zero of=/dev/hdc4 bs=4096 seek=14 count=8 oflag=direct (4) mount the FS and delete the largefile. (5) recreate the largefile. verify that the new largefile does not get any blocks from the groups marked as bad. Without the patch, we will see mb_free_blocks error for each bit in each zero'ed out bitmap at (4). With the patch, we only see the error once per blockgroup: [ 309.706803] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 15: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.720824] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 14: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.732858] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.748321] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 13: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.760331] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.769695] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 12: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.781721] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.798166] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 11: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. [ 309.810184] EXT4-fs error (device sdb4) in ext4_free_blocks:4802: IO failure [ 309.819532] EXT4-fs error (device sdb4): ext4_mb_generate_buddy:735: group 10: 32768 clusters in bitmap, 0 in gd. blk grp corrupted. Google-Bug-Id: 7258357 [darrick.wong@oracle.com] Further modifications (by Darrick) to make more obvious that this corruption bit applies to blocks only. Set the corruption flag if the block group bitmap verification fails. Original-author: Aditya Kali <adityakali@google.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2013-08-29 05:35:51 +08:00
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
}
/**
ext4: check bh in ext4_read_block_bitmap() Validate the bh pointer before using it, since ext4_read_block_bitmap_nowait() might return NULL. I've seen this in fsfuzz testing. EXT4-fs error (device loop0): ext4_read_block_bitmap_nowait:385: comm touch: Cannot get buffer for block bitmap - block_group = 0, block_bitmap = 3925999616 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [<ffffffff8121de25>] ext4_wait_block_bitmap+0x25/0xe0 ... Call Trace: [<ffffffff8121e1e5>] ext4_read_block_bitmap+0x35/0x60 [<ffffffff8125e9c6>] ext4_free_blocks+0x236/0xb80 [<ffffffff811d0d36>] ? __getblk+0x36/0x70 [<ffffffff811d0a5f>] ? __find_get_block+0x8f/0x210 [<ffffffff81191ef3>] ? kmem_cache_free+0x33/0x140 [<ffffffff812678e5>] ext4_xattr_release_block+0x1b5/0x1d0 [<ffffffff812679be>] ext4_xattr_delete_inode+0xbe/0x100 [<ffffffff81222a7c>] ext4_free_inode+0x7c/0x4d0 [<ffffffff812277b8>] ? ext4_mark_inode_dirty+0x88/0x230 [<ffffffff8122993c>] ext4_evict_inode+0x32c/0x490 [<ffffffff811b8cd7>] evict+0xa7/0x1c0 [<ffffffff811b8ed3>] iput_final+0xe3/0x170 [<ffffffff811b8f9e>] iput+0x3e/0x50 [<ffffffff812316fd>] ext4_add_nondir+0x4d/0x90 [<ffffffff81231d0b>] ext4_create+0xeb/0x170 [<ffffffff811aae9c>] vfs_create+0xac/0xd0 [<ffffffff811ac845>] lookup_open+0x185/0x1c0 [<ffffffff8129e3b9>] ? selinux_inode_permission+0xa9/0x170 [<ffffffff811acb54>] do_last+0x2d4/0x7a0 [<ffffffff811af743>] path_openat+0xb3/0x480 [<ffffffff8116a8a1>] ? handle_mm_fault+0x251/0x3b0 [<ffffffff811afc49>] do_filp_open+0x49/0xa0 [<ffffffff811bbaad>] ? __alloc_fd+0xdd/0x150 [<ffffffff8119da28>] do_sys_open+0x108/0x1f0 [<ffffffff8119db51>] sys_open+0x21/0x30 [<ffffffff81618959>] system_call_fastpath+0x16/0x1b Also fix comment for ext4_read_block_bitmap_nowait() Signed-off-by: Eryu Guan <guaneryu@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: stable@vger.kernel.org
2013-01-13 05:33:25 +08:00
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
*
* Return buffer_head on success or NULL in case of failure.
*/
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
struct buffer_head *
ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return NULL;
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
return NULL;
}
if (bitmap_uptodate(bh))
goto verify;
lock_buffer(bh);
if (bitmap_uptodate(bh)) {
unlock_buffer(bh);
goto verify;
}
ext4_lock_group(sb, block_group);
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
Ext4: Uninitialized Block Groups In pass1 of e2fsck, every inode table in the fileystem is scanned and checked, regardless of whether it is in use. This is this the most time consuming part of the filesystem check. The unintialized block group feature can greatly reduce e2fsck time by eliminating checking of uninitialized inodes. With this feature, there is a a high water mark of used inodes for each block group. Block and inode bitmaps can be uninitialized on disk via a flag in the group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. The feature is enabled through a mkfs option mke2fs /dev/ -O uninit_groups A patch adding support for uninitialized block groups to e2fsprogs tools has been posted to the linux-ext4 mailing list. The patches have been stress tested with fsstress and fsx. In performance tests testing e2fsck time, we have seen that e2fsck time on ext3 grows linearly with the total number of inodes in the filesytem. In ext4 with the uninitialized block groups feature, the e2fsck time is constant, based solely on the number of used inodes rather than the total inode count. Since typical ext4 filesystems only use 1-10% of their inodes, this feature can greatly reduce e2fsck time for users. With performance improvement of 2-20 times, depending on how full the filesystem is. The attached graph shows the major improvements in e2fsck times in filesystems with a large total inode count, but few inodes in use. In each group descriptor if we have EXT4_BG_INODE_UNINIT set in bg_flags: Inode table is not initialized/used in this group. So we can skip the consistency check during fsck. EXT4_BG_BLOCK_UNINIT set in bg_flags: No block in the group is used. So we can skip the block bitmap verification for this group. We also add two new fields to group descriptor as a part of uninitialized group patch. __le16 bg_itable_unused; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ bg_itable_unused: If we have EXT4_BG_INODE_UNINIT not set in bg_flags then bg_itable_unused will give the offset within the inode table till the inodes are used. This can be used by fsck to skip list of inodes that are marked unused. bg_checksum: Now that we depend on bg_flags and bg_itable_unused to determine the block and inode usage, we need to make sure group descriptor is not corrupt. We add checksum to group descriptor to detect corruption. If the descriptor is found to be corrupt, we mark all the blocks and inodes in the group used. Signed-off-by: Avantika Mathur <mathur@us.ibm.com> Signed-off-by: Andreas Dilger <adilger@clusterfs.com> Signed-off-by: Mingming Cao <cmm@us.ibm.com> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
2007-10-17 06:38:25 +08:00
}
ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
/*
* if not uninit if bh is uptodate,
* bitmap is also uptodate
*/
set_bitmap_uptodate(bh);
unlock_buffer(bh);
goto verify;
}
/*
* submit the buffer_head for reading
*/
set_buffer_new(bh);
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
if (buffer_verified(bh))
return bh;
put_bh(bh);
return NULL;
}
/* Returns 0 on success, 1 on error */
int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
struct buffer_head *bh)
{
struct ext4_group_desc *desc;
if (!buffer_new(bh))
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
return 1;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
return 1;
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
ext4_validate_block_bitmap(sb, desc, block_group, bh);
/* ...but check for error just in case errors=continue. */
return !buffer_verified(bh);
}
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct buffer_head *bh;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
ext4: check bh in ext4_read_block_bitmap() Validate the bh pointer before using it, since ext4_read_block_bitmap_nowait() might return NULL. I've seen this in fsfuzz testing. EXT4-fs error (device loop0): ext4_read_block_bitmap_nowait:385: comm touch: Cannot get buffer for block bitmap - block_group = 0, block_bitmap = 3925999616 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [<ffffffff8121de25>] ext4_wait_block_bitmap+0x25/0xe0 ... Call Trace: [<ffffffff8121e1e5>] ext4_read_block_bitmap+0x35/0x60 [<ffffffff8125e9c6>] ext4_free_blocks+0x236/0xb80 [<ffffffff811d0d36>] ? __getblk+0x36/0x70 [<ffffffff811d0a5f>] ? __find_get_block+0x8f/0x210 [<ffffffff81191ef3>] ? kmem_cache_free+0x33/0x140 [<ffffffff812678e5>] ext4_xattr_release_block+0x1b5/0x1d0 [<ffffffff812679be>] ext4_xattr_delete_inode+0xbe/0x100 [<ffffffff81222a7c>] ext4_free_inode+0x7c/0x4d0 [<ffffffff812277b8>] ? ext4_mark_inode_dirty+0x88/0x230 [<ffffffff8122993c>] ext4_evict_inode+0x32c/0x490 [<ffffffff811b8cd7>] evict+0xa7/0x1c0 [<ffffffff811b8ed3>] iput_final+0xe3/0x170 [<ffffffff811b8f9e>] iput+0x3e/0x50 [<ffffffff812316fd>] ext4_add_nondir+0x4d/0x90 [<ffffffff81231d0b>] ext4_create+0xeb/0x170 [<ffffffff811aae9c>] vfs_create+0xac/0xd0 [<ffffffff811ac845>] lookup_open+0x185/0x1c0 [<ffffffff8129e3b9>] ? selinux_inode_permission+0xa9/0x170 [<ffffffff811acb54>] do_last+0x2d4/0x7a0 [<ffffffff811af743>] path_openat+0xb3/0x480 [<ffffffff8116a8a1>] ? handle_mm_fault+0x251/0x3b0 [<ffffffff811afc49>] do_filp_open+0x49/0xa0 [<ffffffff811bbaad>] ? __alloc_fd+0xdd/0x150 [<ffffffff8119da28>] do_sys_open+0x108/0x1f0 [<ffffffff8119db51>] sys_open+0x21/0x30 [<ffffffff81618959>] system_call_fastpath+0x16/0x1b Also fix comment for ext4_read_block_bitmap_nowait() Signed-off-by: Eryu Guan <guaneryu@gmail.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Cc: stable@vger.kernel.org
2013-01-13 05:33:25 +08:00
if (!bh)
return NULL;
if (ext4_wait_block_bitmap(sb, block_group, bh)) {
put_bh(bh);
return NULL;
}
return bh;
}
/**
* ext4_has_free_clusters()
* @sbi: in-core super block structure.
* @nclusters: number of needed blocks
* @flags: flags from ext4_mb_new_blocks()
*
* Check if filesystem has nclusters free & available for allocation.
* On success return 1, return 0 on failure.
*/
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
* we are safe to do a plane bit shift only.
*/
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
resv_clusters;
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
}
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2013-04-10 10:11:22 +08:00
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
return 1;
}
/* No free blocks. Let's see if we can dip into reserved pool */
if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
return 0;
}
int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
if (ext4_has_free_clusters(sbi, nclusters, flags)) {
percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters);
return 0;
} else
return -ENOSPC;
}
/**
* ext4_should_retry_alloc()
* @sb: super block
* @retries number of attemps has been made
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
* for the current or committing transaction to complete, and then
* return TRUE.
*
* if the total number of retries exceed three times, return FALSE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}
/*
* ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
*
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: pointer to total number of clusters needed
* @errp: error code
*
* Return 1st allocated block number on success, *count stores total account
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned int flags,
unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
memset(&ar, 0, sizeof(ar));
/* Fill with neighbour allocated blocks */
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
*count = ar.len;
/*
* Account for the allocated meta blocks. We will never
* fail EDQUOT for metdata, but we do account for it.
*/
if (!(*errp) &&
ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
dquot_alloc_block_nofail(inode,
EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
}
return ret;
}
/**
* ext4_count_free_clusters() -- count filesystem free clusters
* @sb: superblock
*
* Adds up the number of free clusters from each block group.
*/
ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
{
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
unsigned int x;
struct buffer_head *bitmap_bh = NULL;
es = EXT4_SB(sb)->s_es;
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
grp = NULL;
if (EXT4_SB(sb)->s_group_info)
grp = ext4_get_group_info(sb, i);
if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
x = ext4_count_free(bitmap_bh->b_data,
ext4: fix ext4_count_free_clusters() with EXT4FS_DEBUG and bigalloc enabled With bigalloc enabled we must use EXT4_CLUSTERS_PER_GROUP() instead of EXT4_BLOCKS_PER_GROUP() otherwise we will go beyond the allocated buffer. $ mount -t ext4 /dev/vde /vde [ 70.573993] EXT4-fs DEBUG (fs/ext4/mballoc.c, 2346): ext4_mb_alloc_groupinfo: [ 70.575174] allocated s_groupinfo array for 1 meta_bg's [ 70.576172] EXT4-fs DEBUG (fs/ext4/super.c, 2092): ext4_check_descriptors: [ 70.576972] Checking group descriptorsBUG: unable to handle kernel paging request at ffff88006ab56000 [ 72.463686] IP: [<ffffffff81394eb9>] __bitmap_weight+0x2a/0x7f [ 72.464168] PGD 295e067 PUD 2961067 PMD 7fa8e067 PTE 800000006ab56060 [ 72.464738] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 72.465139] Modules linked in: [ 72.465402] CPU: 1 PID: 3560 Comm: mount Tainted: G W 3.14.0-rc2-00069-ge57bce1 #60 [ 72.466079] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 72.466505] task: ffff88007ce6c8a0 ti: ffff88006b7f0000 task.ti: ffff88006b7f0000 [ 72.466505] RIP: 0010:[<ffffffff81394eb9>] [<ffffffff81394eb9>] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP: 0018:ffff88006b7f1c00 EFLAGS: 00010206 [ 72.466505] RAX: 0000000000000000 RBX: 000000000000050a RCX: 0000000000000040 [ 72.466505] RDX: 0000000000000000 RSI: 0000000000080000 RDI: 0000000000000000 [ 72.466505] RBP: ffff88006b7f1c28 R08: 0000000000000002 R09: 0000000000000000 [ 72.466505] R10: 000000000000babe R11: 0000000000000400 R12: 0000000000080000 [ 72.466505] R13: 0000000000000200 R14: 0000000000002000 R15: ffff88006ab55000 [ 72.466505] FS: 00007f43ba1fa840(0000) GS:ffff88007f800000(0000) knlGS:0000000000000000 [ 72.466505] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 72.466505] CR2: ffff88006ab56000 CR3: 000000006b7e6000 CR4: 00000000000006e0 [ 72.466505] Stack: [ 72.466505] ffff88006ab65000 0000000000000000 0000000000000000 0000000000010000 [ 72.466505] ffff88006ab6f400 ffff88006b7f1c58 ffffffff81396bb8 0000000000010000 [ 72.466505] 0000000000000000 ffff88007b869a90 ffff88006a48a000 ffff88006b7f1c70 [ 72.466505] Call Trace: [ 72.466505] [<ffffffff81396bb8>] memweight+0x5f/0x8a [ 72.466505] [<ffffffff811c3b19>] ext4_count_free+0x13/0x21 [ 72.466505] [<ffffffff811c396c>] ext4_count_free_clusters+0xdb/0x171 [ 72.466505] [<ffffffff811e3bdd>] ext4_fill_super+0x117c/0x28ef [ 72.466505] [<ffffffff81391569>] ? vsnprintf+0x1c7/0x3f7 [ 72.466505] [<ffffffff8114d8dc>] mount_bdev+0x145/0x19c [ 72.466505] [<ffffffff811e2a61>] ? ext4_calculate_overhead+0x2a1/0x2a1 [ 72.466505] [<ffffffff811dab1d>] ext4_mount+0x15/0x17 [ 72.466505] [<ffffffff8114e3aa>] mount_fs+0x67/0x150 [ 72.466505] [<ffffffff811637ea>] vfs_kern_mount+0x64/0xde [ 72.466505] [<ffffffff81165d19>] do_mount+0x6fe/0x7f5 [ 72.466505] [<ffffffff81126cc8>] ? strndup_user+0x3a/0xd9 [ 72.466505] [<ffffffff8116604b>] SyS_mount+0x85/0xbe [ 72.466505] [<ffffffff81619e90>] tracesys+0xdd/0xe2 [ 72.466505] Code: c3 89 f0 b9 40 00 00 00 55 99 48 89 e5 41 57 f7 f9 41 56 49 89 ff 41 55 45 31 ed 41 54 41 89 f4 53 31 db 41 89 c6 45 39 ee 7e 10 <4b> 8b 3c ef 49 ff c5 e8 bf ff ff ff 01 c3 eb eb 31 c0 45 85 f6 [ 72.466505] RIP [<ffffffff81394eb9>] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP <ffff88006b7f1c00> [ 72.466505] CR2: ffff88006ab56000 [ 72.466505] ---[ end trace 7d051a08ae138573 ]--- Killed Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
2014-04-15 11:36:15 +08:00
EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
}
brelse(bitmap_bh);
printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
", computed = %llu, %llu\n",
EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
desc_count, bitmap_count);
return bitmap_count;
#else
desc_count = 0;
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
grp = NULL;
if (EXT4_SB(sb)->s_group_info)
grp = ext4_get_group_info(sb, i);
if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
#endif
}
static inline int test_root(ext4_group_t a, int b)
{
while (1) {
if (a < b)
return 0;
if (a == b)
return 1;
if ((a % b) != 0)
return 0;
a = a / b;
}
}
static int ext4_group_sparse(ext4_group_t group)
{
if (group <= 1)
return 1;
if (!(group & 1))
return 0;
return (test_root(group, 7) || test_root(group, 5) ||
test_root(group, 3));
}
/**
* ext4_bg_has_super - number of blocks used by the superblock in group
* @sb: superblock for filesystem
* @group: group number to check
*
* Return the number of blocks used by the superblock (primary or backup)
* in this group. Currently this will be only 0 or 1.
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
!ext4_group_sparse(group))
return 0;
return 1;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
ext4_group_t group)
{
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb);
ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
if (group == first || group == first + 1 || group == last)
return 1;
return 0;
}
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
if (!ext4_bg_has_super(sb, group))
return 0;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
else
return EXT4_SB(sb)->s_gdb_count;
}
/**
* ext4_bg_num_gdb - number of blocks used by the group table in group
* @sb: superblock for filesystem
* @group: group number to check
*
* Return the number of blocks used by the group descriptor table
* (primary or backup) in this group. In the future there may be a
* different number of descriptor blocks in each group.
*/
unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
{
unsigned long first_meta_bg =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
metagroup < first_meta_bg)
return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
}
/*
* This function returns the number of file system metadata clusters at
* the beginning of a block group, including the reserved gdt blocks.
*/
static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned num;
/* Check for superblock and gdt backups in this group */
num = ext4_bg_has_super(sb, block_group);
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
num += ext4_bg_num_gdb(sb, block_group);
num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
num += ext4_bg_num_gdb(sb, block_group);
}
return EXT4_NUM_B2C(sbi, num);
}
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
*
* Return the ideal location to start allocating blocks for a
* newly created inode.
*/
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
ext4_group_t block_group;
ext4_grpblk_t colour;
int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
block_group = ei->i_block_group;
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
* block groups per flexgroup, reserve the first block
* group for directories and special files. Regular
* files will start at the second block group. This
* tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
if (S_ISREG(inode->i_mode))
block_group++;
}
bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
/*
* If we are doing delayed allocation, we don't need take
* colour into account.
*/
if (test_opt(inode->i_sb, DELALLOC))
return bg_start;
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
else
colour = (current->pid % 16) * ((last_block - bg_start) / 16);
return bg_start + colour;
}