mirror of
https://github.com/edk2-porting/linux-next.git
synced 2025-01-16 17:43:56 +08:00
4636e70bb0
Patch series "mm,dax: Fix data corruption due to mmap inconsistency",
v4.
This series fixes data corruption that can happen for DAX mounts when
page faults race with write(2) and as a result page tables get out of
sync with block mappings in the filesystem and thus data seen through
mmap is different from data seen through read(2).
The series passes testing with t_mmap_stale test program from Ross and
also other mmap related tests on DAX filesystem.
This patch (of 4):
dax_invalidate_mapping_entry() currently removes DAX exceptional entries
only if they are clean and unlocked. This is done via:
invalidate_mapping_pages()
invalidate_exceptional_entry()
dax_invalidate_mapping_entry()
However, for page cache pages removed in invalidate_mapping_pages()
there is an additional criteria which is that the page must not be
mapped. This is noted in the comments above invalidate_mapping_pages()
and is checked in invalidate_inode_page().
For DAX entries this means that we can can end up in a situation where a
DAX exceptional entry, either a huge zero page or a regular DAX entry,
could end up mapped but without an associated radix tree entry. This is
inconsistent with the rest of the DAX code and with what happens in the
page cache case.
We aren't able to unmap the DAX exceptional entry because according to
its comments invalidate_mapping_pages() isn't allowed to block, and
unmap_mapping_range() takes a write lock on the mapping->i_mmap_rwsem.
Since we essentially never have unmapped DAX entries to evict from the
radix tree, just remove dax_invalidate_mapping_entry().
Fixes: c6dcf52c23
("mm: Invalidate DAX radix tree entries only if appropriate")
Link: http://lkml.kernel.org/r/20170510085419.27601-2-jack@suse.cz
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Jan Kara <jack@suse.cz>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: <stable@vger.kernel.org> [4.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
113 lines
3.5 KiB
C
113 lines
3.5 KiB
C
#ifndef _LINUX_DAX_H
|
|
#define _LINUX_DAX_H
|
|
|
|
#include <linux/fs.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/radix-tree.h>
|
|
#include <asm/pgtable.h>
|
|
|
|
struct iomap_ops;
|
|
struct dax_device;
|
|
struct dax_operations {
|
|
/*
|
|
* direct_access: translate a device-relative
|
|
* logical-page-offset into an absolute physical pfn. Return the
|
|
* number of pages available for DAX at that pfn.
|
|
*/
|
|
long (*direct_access)(struct dax_device *, pgoff_t, long,
|
|
void **, pfn_t *);
|
|
};
|
|
|
|
int dax_read_lock(void);
|
|
void dax_read_unlock(int id);
|
|
struct dax_device *dax_get_by_host(const char *host);
|
|
struct dax_device *alloc_dax(void *private, const char *host,
|
|
const struct dax_operations *ops);
|
|
void put_dax(struct dax_device *dax_dev);
|
|
bool dax_alive(struct dax_device *dax_dev);
|
|
void kill_dax(struct dax_device *dax_dev);
|
|
void *dax_get_private(struct dax_device *dax_dev);
|
|
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
|
|
void **kaddr, pfn_t *pfn);
|
|
|
|
/*
|
|
* We use lowest available bit in exceptional entry for locking, one bit for
|
|
* the entry size (PMD) and two more to tell us if the entry is a huge zero
|
|
* page (HZP) or an empty entry that is just used for locking. In total four
|
|
* special bits.
|
|
*
|
|
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
|
|
* EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
|
|
* block allocation.
|
|
*/
|
|
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
|
|
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
|
|
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
|
|
#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
|
|
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
|
|
|
|
static inline unsigned long dax_radix_sector(void *entry)
|
|
{
|
|
return (unsigned long)entry >> RADIX_DAX_SHIFT;
|
|
}
|
|
|
|
static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
|
|
{
|
|
return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
|
|
((unsigned long)sector << RADIX_DAX_SHIFT) |
|
|
RADIX_DAX_ENTRY_LOCK);
|
|
}
|
|
|
|
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
|
|
const struct iomap_ops *ops);
|
|
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
|
|
const struct iomap_ops *ops);
|
|
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
|
|
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
|
|
pgoff_t index);
|
|
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
|
|
pgoff_t index, void *entry, bool wake_all);
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
int __dax_zero_page_range(struct block_device *bdev,
|
|
struct dax_device *dax_dev, sector_t sector,
|
|
unsigned int offset, unsigned int length);
|
|
#else
|
|
static inline int __dax_zero_page_range(struct block_device *bdev,
|
|
struct dax_device *dax_dev, sector_t sector,
|
|
unsigned int offset, unsigned int length)
|
|
{
|
|
return -ENXIO;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_FS_DAX_PMD
|
|
static inline unsigned int dax_radix_order(void *entry)
|
|
{
|
|
if ((unsigned long)entry & RADIX_DAX_PMD)
|
|
return PMD_SHIFT - PAGE_SHIFT;
|
|
return 0;
|
|
}
|
|
#else
|
|
static inline unsigned int dax_radix_order(void *entry)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
int dax_pfn_mkwrite(struct vm_fault *vmf);
|
|
|
|
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
|
{
|
|
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
|
}
|
|
|
|
static inline bool dax_mapping(struct address_space *mapping)
|
|
{
|
|
return mapping->host && IS_DAX(mapping->host);
|
|
}
|
|
|
|
struct writeback_control;
|
|
int dax_writeback_mapping_range(struct address_space *mapping,
|
|
struct block_device *bdev, struct writeback_control *wbc);
|
|
#endif
|