2
0
mirror of https://github.com/edk2-porting/linux-next.git synced 2024-12-27 14:43:58 +08:00
linux-next/fs/xfs/xfs_dir2_readdir.c

687 lines
18 KiB
C
Raw Normal View History

/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
/*
* Directory file type support functions
*/
static unsigned char xfs_dir3_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
};
static unsigned char
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
xfs_dir3_get_dtype(
struct xfs_mount *mp,
__uint8_t filetype)
{
if (!xfs_sb_version_hasftype(&mp->m_sb))
return DT_UNKNOWN;
if (filetype >= XFS_DIR3_FT_MAX)
return DT_UNKNOWN;
return xfs_dir3_filetype_table[filetype];
}
STATIC int
xfs_dir2_sf_getdents(
struct xfs_da_args *args,
struct dir_context *ctx)
{
int i; /* shortform entry number */
struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
struct xfs_da_geometry *geo = args->geo;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
ASSERT(dp->i_df.if_u1.if_data != NULL);
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
/*
* If the block number in the offset is out of range, we're done.
*/
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
/*
* Precalculate offsets for . and .. as we will always need them.
*
* XXX(hch): the second argument is sometimes 0 and sometimes
* geo->datablk
*/
dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
dp->d_ops->data_dot_offset);
dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
dp->d_ops->data_dotdot_offset);
/*
* Put . entry unless we're starting past it.
*/
if (ctx->pos <= dot_offset) {
ctx->pos = dot_offset & 0x7fffffff;
if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
return 0;
}
/*
* Put .. entry unless we're starting past it.
*/
if (ctx->pos <= dotdot_offset) {
ino = dp->d_ops->sf_get_parent_ino(sfp);
ctx->pos = dotdot_offset & 0x7fffffff;
if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
}
/*
* Loop while there are more entries and put'ing works.
*/
sfep = xfs_dir2_sf_firstentry(sfp);
for (i = 0; i < sfp->count; i++) {
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
__uint8_t filetype;
off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
xfs_dir2_sf_get_offset(sfep));
if (ctx->pos > off) {
xfs: abstract the differences in dir2/dir3 via an ops vector Lots of the dir code now goes through switches to determine what is the correct on-disk format to parse. It generally involves a "xfs_sbversion_hasfoo" check, deferencing the superblock version and feature fields and hence touching several cache lines per operation in the process. Some operations do multiple checks because they nest conditional operations and they don't pass the information in a direct fashion between each other. Hence, add an ops vector to the xfs_inode structure that is configured when the inode is initialised to point to all the correct decode and encoding operations. This will significantly reduce the branchiness and cacheline footprint of the directory object decoding and encoding. This is the first patch in a series of conversion patches. It will introduce the ops structure, the setup of it and add the first operation to the vector. Subsequent patches will convert directory ops one at a time to keep the changes simple and obvious. Just this patch shows the benefit of such an approach on code size. Just converting the two shortform dir operations as this patch does decreases the built binary size by ~1500 bytes: $ size fs/xfs/xfs.o.orig fs/xfs/xfs.o.p1 text data bss dec hex filename 794490 96802 1096 892388 d9de4 fs/xfs/xfs.o.orig 792986 96802 1096 890884 d9804 fs/xfs/xfs.o.p1 $ That's a significant decrease in the instruction cache footprint of the directory code for such a simple change, and indicates that this approach is definitely worth pursuing further. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-29 19:11:46 +08:00
sfep = dp->d_ops->sf_nextentry(sfp, sfep);
continue;
}
ino = dp->d_ops->sf_get_ino(sfp, sfep);
filetype = dp->d_ops->sf_get_ftype(sfep);
ctx->pos = off & 0x7fffffff;
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
xfs_dir3_get_dtype(dp->i_mount, filetype)))
return 0;
xfs: abstract the differences in dir2/dir3 via an ops vector Lots of the dir code now goes through switches to determine what is the correct on-disk format to parse. It generally involves a "xfs_sbversion_hasfoo" check, deferencing the superblock version and feature fields and hence touching several cache lines per operation in the process. Some operations do multiple checks because they nest conditional operations and they don't pass the information in a direct fashion between each other. Hence, add an ops vector to the xfs_inode structure that is configured when the inode is initialised to point to all the correct decode and encoding operations. This will significantly reduce the branchiness and cacheline footprint of the directory object decoding and encoding. This is the first patch in a series of conversion patches. It will introduce the ops structure, the setup of it and add the first operation to the vector. Subsequent patches will convert directory ops one at a time to keep the changes simple and obvious. Just this patch shows the benefit of such an approach on code size. Just converting the two shortform dir operations as this patch does decreases the built binary size by ~1500 bytes: $ size fs/xfs/xfs.o.orig fs/xfs/xfs.o.p1 text data bss dec hex filename 794490 96802 1096 892388 d9de4 fs/xfs/xfs.o.orig 792986 96802 1096 890884 d9804 fs/xfs/xfs.o.p1 $ That's a significant decrease in the instruction cache footprint of the directory code for such a simple change, and indicates that this approach is definitely worth pursuing further. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-10-29 19:11:46 +08:00
sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
0x7fffffff;
return 0;
}
/*
* Readdir for block directories.
*/
STATIC int
xfs_dir2_block_getdents(
struct xfs_da_args *args,
struct dir_context *ctx)
{
struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* block data entry */
xfs_dir2_data_unused_t *dup; /* block unused entry */
char *endptr; /* end of the data entries */
int error; /* error return value */
char *ptr; /* current data entry */
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
int lock_mode;
/*
* If the block number in the offset is out of range, we're done.
*/
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(NULL, dp, &bp);
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
xfs_iunlock(dp, lock_mode);
if (error)
return error;
/*
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Set up values for the loop.
*/
btp = xfs_dir2_block_tail_p(geo, hdr);
ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
/*
* Loop over the data portion of the block.
* Each object is a real entry (dep) or an unused one (dup).
*/
while (ptr < endptr) {
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
__uint8_t filetype;
dup = (xfs_dir2_data_unused_t *)ptr;
/*
* Unused, skip it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
ptr += be16_to_cpu(dup->length);
continue;
}
dep = (xfs_dir2_data_entry_t *)ptr;
/*
* Bump pointer for the next iteration.
*/
ptr += dp->d_ops->data_entsize(dep->namelen);
/*
* The entry is before the desired starting point, skip it.
*/
if ((char *)dep - (char *)hdr < wantoff)
continue;
cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
(char *)dep - (char *)hdr);
ctx->pos = cook & 0x7fffffff;
filetype = dp->d_ops->data_get_ftype(dep);
/*
* If it didn't fit, set the final offset to here & return.
*/
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype))) {
xfs_trans_brelse(NULL, bp);
return 0;
}
}
/*
* Reached the end of the block.
* Set the offset to a non-existent block 1 and return.
*/
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
0x7fffffff;
xfs_trans_brelse(NULL, bp);
return 0;
}
struct xfs_dir2_leaf_map_info {
xfs_extlen_t map_blocks; /* number of fsbs in map */
xfs_dablk_t map_off; /* last mapped file offset */
int map_size; /* total entries in *map */
int map_valid; /* valid entries in *map */
int nmap; /* mappings to ask xfs_bmapi */
xfs_dir2_db_t curdb; /* db for current block */
int ra_current; /* number of read-ahead blks */
int ra_index; /* *map index for read-ahead */
int ra_offset; /* map entry offset for ra */
int ra_want; /* readahead count wanted */
struct xfs_bmbt_irec map[]; /* map vector for blocks */
};
STATIC int
xfs_dir2_leaf_readbuf(
struct xfs_da_args *args,
size_t bufsize,
struct xfs_dir2_leaf_map_info *mip,
xfs_dir2_off_t *curoff,
struct xfs_buf **bpp,
bool trim_map)
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL;
struct xfs_bmbt_irec *map = mip->map;
struct blk_plug plug;
int error = 0;
int length;
int i;
int j;
struct xfs_da_geometry *geo = args->geo;
/*
* If the caller just finished processing a buffer, it will tell us
* we need to trim that block out of the mapping now it is done.
*/
if (trim_map) {
mip->map_blocks -= geo->fsbcount;
/*
* Loop to get rid of the extents for the
* directory block.
*/
for (i = geo->fsbcount; i > 0; ) {
j = min_t(int, map->br_blockcount, i);
map->br_blockcount -= j;
map->br_startblock += j;
map->br_startoff += j;
/*
* If mapping is done, pitch it from
* the table.
*/
if (!map->br_blockcount && --mip->map_valid)
memmove(&map[0], &map[1],
sizeof(map[0]) * mip->map_valid);
i -= j;
}
}
/*
* Recalculate the readahead blocks wanted.
*/
mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
ASSERT(mip->ra_want >= 0);
/*
* If we don't have as many as we want, and we haven't
* run out of data blocks, get some more mappings.
*/
if (1 + mip->ra_want > mip->map_blocks &&
mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
/*
* Get more bmaps, fill in after the ones
* we already have in the table.
*/
mip->nmap = mip->map_size - mip->map_valid;
error = xfs_bmapi_read(dp, mip->map_off,
xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
mip->map_off,
&map[mip->map_valid], &mip->nmap, 0);
/*
* Don't know if we should ignore this or try to return an
* error. The trouble with returning errors is that readdir
* will just stop without actually passing the error through.
*/
if (error)
goto out; /* XXX */
/*
* If we got all the mappings we asked for, set the final map
* offset based on the last bmap value received. Otherwise,
* we've reached the end.
*/
if (mip->nmap == mip->map_size - mip->map_valid) {
i = mip->map_valid + mip->nmap - 1;
mip->map_off = map[i].br_startoff + map[i].br_blockcount;
} else
mip->map_off = xfs_dir2_byte_to_da(geo,
XFS_DIR2_LEAF_OFFSET);
/*
* Look for holes in the mapping, and eliminate them. Count up
* the valid blocks.
*/
for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
if (map[i].br_startblock == HOLESTARTBLOCK) {
mip->nmap--;
length = mip->map_valid + mip->nmap - i;
if (length)
memmove(&map[i], &map[i + 1],
sizeof(map[i]) * length);
} else {
mip->map_blocks += map[i].br_blockcount;
i++;
}
}
mip->map_valid += mip->nmap;
}
/*
* No valid mappings, so no more data blocks.
*/
if (!mip->map_valid) {
*curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
goto out;
}
/*
* Read the directory block starting at the first mapping.
*/
mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
map->br_blockcount >= geo->fsbcount ?
XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
-1, &bp);
/*
* Should just skip over the data block instead of giving up.
*/
if (error)
goto out; /* XXX */
/*
* Adjust the current amount of read-ahead: we just read a block that
* was previously ra.
*/
if (mip->ra_current)
mip->ra_current -= geo->fsbcount;
/*
* Do we need more readahead?
xfs: handle array index overrun in xfs_dir2_leaf_readbuf() Carlos had a case where "find" seemed to start spinning forever and never return. This was on a filesystem with non-default multi-fsb (8k) directory blocks, and a fragmented directory with extents like this: 0:[0,133646,2,0] 1:[2,195888,1,0] 2:[3,195890,1,0] 3:[4,195892,1,0] 4:[5,195894,1,0] 5:[6,195896,1,0] 6:[7,195898,1,0] 7:[8,195900,1,0] 8:[9,195902,1,0] 9:[10,195908,1,0] 10:[11,195910,1,0] 11:[12,195912,1,0] 12:[13,195914,1,0] ... i.e. the first extent is a contiguous 2-fsb dir block, but after that it is fragmented into 1 block extents. At the top of the readdir path, we allocate a mapping array which (for this filesystem geometry) can hold 10 extents; see the assignment to map_info->map_size. During readdir, we are therefore able to map extents 0 through 9 above into the array for readahead purposes. If we count by 2, we see that the last mapped index (9) is the first block of a 2-fsb directory block. At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill more readahead; the outer loop assumes one full dir block is processed each loop iteration, and an inner loop that ensures that this is so by advancing to the next extent until a full directory block is mapped. The problem is that this inner loop may step past the last extent in the mapping array as it tries to reach the end of the directory block. This will read garbage for the extent length, and as a result the loop control variable 'j' may become corrupted and never fail the loop conditional. The number of valid mappings we have in our array is stored in map->map_valid, so stop this inner loop based on that limit. There is an ASSERT at the top of the outer loop for this same condition, but we never made it out of the inner loop, so the ASSERT never fired. Huge appreciation for Carlos for debugging and isolating the problem. Debugged-and-analyzed-by: Carlos Maiolino <cmaiolino@redhat.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> Tested-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Bill O'Donnell <billodo@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-04-14 06:15:47 +08:00
* Each loop tries to process 1 full dir blk; last may be partial.
*/
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
i += geo->fsbcount) {
ASSERT(mip->ra_index < mip->map_valid);
/*
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
xfs: prevent multi-fsb dir readahead from reading random blocks Directory block readahead uses a complex iteration mechanism to map between high-level directory blocks and underlying physical extents. This mechanism attempts to traverse the higher-level dir blocks in a manner that handles multi-fsb directory blocks and simultaneously maintains a reference to the corresponding physical blocks. This logic doesn't handle certain (discontiguous) physical extent layouts correctly with multi-fsb directory blocks. For example, consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory block size and a directory with the following extent layout: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..7]: 88..95 0 (88..95) 8 1: [8..15]: 80..87 0 (80..87) 8 2: [16..39]: 168..191 0 (168..191) 24 3: [40..63]: 5242952..5242975 1 (72..95) 24 Directory block 0 spans physical extents 0 and 1, dirblk 1 lies entirely within extent 2 and dirblk 2 spans extents 2 and 3. Because extent 2 is larger than the directory block size, the readahead code erroneously assumes the block is contiguous and issues a readahead based on the physical mapping of the first fsb of the dirblk. This results in read verifier failure and a spurious corruption or crc failure, depending on the filesystem format. Further, the subsequent readahead code responsible for walking through the physical table doesn't correctly advance the physical block reference for dirblk 2. Instead of advancing two physical filesystem blocks, the first iteration of the loop advances 1 block (correctly), but the subsequent iteration advances 2 more physical blocks because the next physical extent (extent 3, above) happens to cover more than dirblk 2. At this point, the higher-level directory block walking is completely off the rails of the actual physical layout of the directory for the respective mapping table. Update the contiguous dirblock logic to consider the current offset in the physical extent to avoid issuing directory readahead to unrelated blocks. Also, update the mapping table advancing code to consider the current offset within the current dirblock to avoid advancing the mapping reference too far beyond the dirblock. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-04-20 23:06:47 +08:00
(map[mip->ra_index].br_blockcount - mip->ra_offset) >=
geo->fsbcount) {
xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(dp->i_mount,
map[mip->ra_index].br_startblock +
mip->ra_offset));
mip->ra_current = i;
}
/*
* Read-ahead a non-contiguous directory block. This doesn't
* use our mapping, but this is a very rare case.
*/
else if (i > mip->ra_current) {
xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff +
mip->ra_offset, -1);
mip->ra_current = i;
}
/*
xfs: handle array index overrun in xfs_dir2_leaf_readbuf() Carlos had a case where "find" seemed to start spinning forever and never return. This was on a filesystem with non-default multi-fsb (8k) directory blocks, and a fragmented directory with extents like this: 0:[0,133646,2,0] 1:[2,195888,1,0] 2:[3,195890,1,0] 3:[4,195892,1,0] 4:[5,195894,1,0] 5:[6,195896,1,0] 6:[7,195898,1,0] 7:[8,195900,1,0] 8:[9,195902,1,0] 9:[10,195908,1,0] 10:[11,195910,1,0] 11:[12,195912,1,0] 12:[13,195914,1,0] ... i.e. the first extent is a contiguous 2-fsb dir block, but after that it is fragmented into 1 block extents. At the top of the readdir path, we allocate a mapping array which (for this filesystem geometry) can hold 10 extents; see the assignment to map_info->map_size. During readdir, we are therefore able to map extents 0 through 9 above into the array for readahead purposes. If we count by 2, we see that the last mapped index (9) is the first block of a 2-fsb directory block. At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill more readahead; the outer loop assumes one full dir block is processed each loop iteration, and an inner loop that ensures that this is so by advancing to the next extent until a full directory block is mapped. The problem is that this inner loop may step past the last extent in the mapping array as it tries to reach the end of the directory block. This will read garbage for the extent length, and as a result the loop control variable 'j' may become corrupted and never fail the loop conditional. The number of valid mappings we have in our array is stored in map->map_valid, so stop this inner loop based on that limit. There is an ASSERT at the top of the outer loop for this same condition, but we never made it out of the inner loop, so the ASSERT never fired. Huge appreciation for Carlos for debugging and isolating the problem. Debugged-and-analyzed-by: Carlos Maiolino <cmaiolino@redhat.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> Tested-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Bill O'Donnell <billodo@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-04-14 06:15:47 +08:00
* Advance offset through the mapping table, processing a full
* dir block even if it is fragmented into several extents.
* But stop if we have consumed all valid mappings, even if
* it's not yet a full directory block.
*/
xfs: handle array index overrun in xfs_dir2_leaf_readbuf() Carlos had a case where "find" seemed to start spinning forever and never return. This was on a filesystem with non-default multi-fsb (8k) directory blocks, and a fragmented directory with extents like this: 0:[0,133646,2,0] 1:[2,195888,1,0] 2:[3,195890,1,0] 3:[4,195892,1,0] 4:[5,195894,1,0] 5:[6,195896,1,0] 6:[7,195898,1,0] 7:[8,195900,1,0] 8:[9,195902,1,0] 9:[10,195908,1,0] 10:[11,195910,1,0] 11:[12,195912,1,0] 12:[13,195914,1,0] ... i.e. the first extent is a contiguous 2-fsb dir block, but after that it is fragmented into 1 block extents. At the top of the readdir path, we allocate a mapping array which (for this filesystem geometry) can hold 10 extents; see the assignment to map_info->map_size. During readdir, we are therefore able to map extents 0 through 9 above into the array for readahead purposes. If we count by 2, we see that the last mapped index (9) is the first block of a 2-fsb directory block. At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill more readahead; the outer loop assumes one full dir block is processed each loop iteration, and an inner loop that ensures that this is so by advancing to the next extent until a full directory block is mapped. The problem is that this inner loop may step past the last extent in the mapping array as it tries to reach the end of the directory block. This will read garbage for the extent length, and as a result the loop control variable 'j' may become corrupted and never fail the loop conditional. The number of valid mappings we have in our array is stored in map->map_valid, so stop this inner loop based on that limit. There is an ASSERT at the top of the outer loop for this same condition, but we never made it out of the inner loop, so the ASSERT never fired. Huge appreciation for Carlos for debugging and isolating the problem. Debugged-and-analyzed-by: Carlos Maiolino <cmaiolino@redhat.com> Signed-off-by: Eric Sandeen <sandeen@redhat.com> Tested-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com> Reviewed-by: Bill O'Donnell <billodo@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-04-14 06:15:47 +08:00
for (j = 0;
j < geo->fsbcount && mip->ra_index < mip->map_valid;
j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
xfs: prevent multi-fsb dir readahead from reading random blocks Directory block readahead uses a complex iteration mechanism to map between high-level directory blocks and underlying physical extents. This mechanism attempts to traverse the higher-level dir blocks in a manner that handles multi-fsb directory blocks and simultaneously maintains a reference to the corresponding physical blocks. This logic doesn't handle certain (discontiguous) physical extent layouts correctly with multi-fsb directory blocks. For example, consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory block size and a directory with the following extent layout: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..7]: 88..95 0 (88..95) 8 1: [8..15]: 80..87 0 (80..87) 8 2: [16..39]: 168..191 0 (168..191) 24 3: [40..63]: 5242952..5242975 1 (72..95) 24 Directory block 0 spans physical extents 0 and 1, dirblk 1 lies entirely within extent 2 and dirblk 2 spans extents 2 and 3. Because extent 2 is larger than the directory block size, the readahead code erroneously assumes the block is contiguous and issues a readahead based on the physical mapping of the first fsb of the dirblk. This results in read verifier failure and a spurious corruption or crc failure, depending on the filesystem format. Further, the subsequent readahead code responsible for walking through the physical table doesn't correctly advance the physical block reference for dirblk 2. Instead of advancing two physical filesystem blocks, the first iteration of the loop advances 1 block (correctly), but the subsequent iteration advances 2 more physical blocks because the next physical extent (extent 3, above) happens to cover more than dirblk 2. At this point, the higher-level directory block walking is completely off the rails of the actual physical layout of the directory for the respective mapping table. Update the contiguous dirblock logic to consider the current offset in the physical extent to avoid issuing directory readahead to unrelated blocks. Also, update the mapping table advancing code to consider the current offset within the current dirblock to avoid advancing the mapping reference too far beyond the dirblock. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2017-04-20 23:06:47 +08:00
length = min_t(int, geo->fsbcount - j,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
mip->ra_offset += length;
/*
* Advance to the next mapping if this one is used up.
*/
if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
mip->ra_offset = 0;
mip->ra_index++;
}
}
}
blk_finish_plug(&plug);
out:
*bpp = bp;
return error;
}
/*
* Getdents (readdir) for leaf and node directories.
* This reads the data blocks only, so is the same for both forms.
*/
STATIC int
xfs_dir2_leaf_getdents(
struct xfs_da_args *args,
struct dir_context *ctx,
size_t bufsize)
{
struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
int error = 0; /* error return value */
int length; /* temporary length value */
int byteoff; /* offset in current block */
xfs_dir2_off_t curoff; /* current overall offset */
xfs_dir2_off_t newoff; /* new curoff after new blk */
char *ptr = NULL; /* pointer to current data */
struct xfs_dir2_leaf_map_info *map_info;
struct xfs_da_geometry *geo = args->geo;
/*
* If the offset is at or past the largest allowed value,
* give up right away.
*/
if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
return 0;
/*
* Set up to bmap a number of blocks based on the caller's
* buffer size, the directory block size, and the filesystem
* block size.
*/
length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
(length * sizeof(struct xfs_bmbt_irec)),
KM_SLEEP | KM_NOFS);
map_info->map_size = length;
/*
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
/*
* Force this conversion through db so we truncate the offset
* down to get the start of the data block.
*/
map_info->map_off = xfs_dir2_db_to_da(geo,
xfs_dir2_byte_to_db(geo, curoff));
/*
* Loop over directory entries until we reach the end offset.
* Get more blocks and readahead as necessary.
*/
while (curoff < XFS_DIR2_LEAF_OFFSET) {
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
__uint8_t filetype;
/*
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
int lock_mode;
bool trim_map = false;
if (bp) {
xfs_trans_brelse(NULL, bp);
bp = NULL;
trim_map = true;
}
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp, trim_map);
xfs: stop holding ILOCK over filldir callbacks The recent change to the readdir locking made in 40194ec ("xfs: reinstate the ilock in xfs_readdir") for CXFS directory sanity was probably the wrong thing to do. Deep in the readdir code we can take page faults in the filldir callback, and so taking a page fault while holding an inode ilock creates a new set of locking issues that lockdep warns all over the place about. The locking order for regular inodes w.r.t. page faults is io_lock -> pagefault -> mmap_sem -> ilock. The directory readdir code now triggers ilock -> page fault -> mmap_sem. While we cannot deadlock at this point, it inverts all the locking patterns that lockdep normally sees on XFS inodes, and so triggers lockdep. We worked around this with commit 93a8614 ("xfs: fix directory inode iolock lockdep false positive"), but that then just moved the lockdep warning to deeper in the page fault path and triggered on security inode locks. Fixing the shmem issue there just moved the lockdep reports somewhere else, and now we are getting false positives from filesystem freezing annotations getting confused. Further, if we enter memory reclaim in a readdir path, we now get lockdep warning about potential deadlocks because the ilock is held when we enter reclaim. This, again, is different to a regular file in that we never allow memory reclaim to run while holding the ilock for regular files. Hence lockdep now throws ilock->kmalloc->reclaim->ilock warnings. Basically, the problem is that the ilock is being used to protect the directory data and the inode metadata, whereas for a regular file the iolock protects the data and the ilock protects the metadata. From the VFS perspective, the i_mutex serialises all accesses to the directory data, and so not holding the ilock for readdir doesn't matter. The issue is that CXFS doesn't access directory data via the VFS, so it has no "data serialisaton" mechanism. Hence we need to hold the IOLOCK in the correct places to provide this low level directory data access serialisation. The ilock can then be used just when the extent list needs to be read, just like we do for regular files. The directory modification code can take the iolock exclusive when the ilock is also taken, and this then ensures that readdir is correct excluded while modifications are in progress. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
xfs_iunlock(dp, lock_mode);
if (error || !map_info->map_valid)
break;
/*
* Having done a read, we need to set a new offset.
*/
newoff = xfs_dir2_db_off_to_byte(geo,
map_info->curdb, 0);
/*
* Start of the current block.
*/
if (curoff < newoff)
curoff = newoff;
/*
* Make sure we're in the right block.
*/
else if (curoff > newoff)
ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
map_info->curdb);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
*/
ptr = (char *)dp->d_ops->data_entry_p(hdr);
byteoff = xfs_dir2_byte_to_off(geo, curoff);
/*
* Skip past the header.
*/
if (byteoff == 0)
curoff += dp->d_ops->data_entry_offset;
/*
* Skip past entries until we reach our offset.
*/
else {
while ((char *)ptr - (char *)hdr < byteoff) {
dup = (xfs_dir2_data_unused_t *)ptr;
if (be16_to_cpu(dup->freetag)
== XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
ptr += length;
continue;
}
dep = (xfs_dir2_data_entry_t *)ptr;
length =
dp->d_ops->data_entsize(dep->namelen);
ptr += length;
}
/*
* Now set our real offset.
*/
curoff =
xfs_dir2_db_off_to_byte(geo,
xfs_dir2_byte_to_db(geo, curoff),
(char *)ptr - (char *)hdr);
if (ptr >= (char *)hdr + geo->blksize) {
continue;
}
}
}
/*
* We have a pointer to an entry.
* Is it a live one?
*/
dup = (xfs_dir2_data_unused_t *)ptr;
/*
* No, it's unused, skip over it.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
length = be16_to_cpu(dup->length);
ptr += length;
curoff += length;
continue;
}
dep = (xfs_dir2_data_entry_t *)ptr;
length = dp->d_ops->data_entsize(dep->namelen);
filetype = dp->d_ops->data_get_ftype(dep);
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
xfs: Add read-only support for dirent filetype field Add support for the file type field in directory entries so that readdir can return the type of the inode the dirent points to to userspace without first having to read the inode off disk. The encoding of the type field is a single byte that is added to the end of the directory entry name length. For all intents and purposes, it appends a "hidden" byte to the name field which contains the type information. As the directory entry is already of dynamic size, helpers are already required to access and decode the direct entry structures. Hence the relevent extraction and iteration helpers are updated to understand the hidden byte. Helpers for reading and writing the filetype field from the directory entries are also added. Only the read helpers are used by this patch. It also adds all the code necessary to read the type information out of the dirents on disk. Further we add the superblock feature bit and helpers to indicate that we understand the on-disk format change. This is not a compatible change - existing kernels cannot read the new format successfully - so an incompatible feature flag is added. We don't yet allow filesystems to mount with this flag yet - that will be added once write support is added. Finally, the code to take the type from the VFS, convert it to an XFS on-disk type and put it into the xfs_name structures passed around is added, but the directory code does not use this field yet. That will be in the next patch. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-12 18:50:09 +08:00
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype)))
break;
/*
* Advance to next entry in the block.
*/
ptr += length;
curoff += length;
/* bufsize may have just been a guess; don't go negative */
bufsize = bufsize > length ? bufsize - length : 0;
}
/*
* All done. Set output offset value to current offset.
*/
if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
kmem_free(map_info);
if (bp)
xfs_trans_brelse(NULL, bp);
return error;
}
/*
* Read a directory.
*/
int
xfs_readdir(
struct xfs_inode *dp,
struct dir_context *ctx,
size_t bufsize)
{
struct xfs_da_args args = { NULL };
int rval;
int v;
trace_xfs_readdir(dp);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
;
else if (v)
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
return rval;
}